Example #1
File: utils.py Project: pnmoralesh/Devito
def derive_parameters(iet, drop_locals=False):
    """
    Derive all input parameters (function call arguments) from an IET
    by collecting all symbols not defined in the tree itself.
    """
    # Pick all free symbols and symbolic functions from the kernel
    functions = FindSymbols('symbolics').visit(iet)
    free_symbols = FindSymbols('free-symbols').visit(iet)

    # Filter out function base symbols and use real function objects
    function_names = set(flatten([(s.name, s._C_name) for s in functions]))
    symbols = [s for s in free_symbols if s.name not in function_names]
    symbols = functions + symbols

    defines = [s.name for s in FindSymbols('defines').visit(iet)]
    parameters = tuple(s for s in symbols if s.name not in defines)

    # Drop globally-visible objects
    parameters = [p for p in parameters if not isinstance(p, (Literal, Macro))]

    # Maybe filter out all other compiler-generated objects
    if drop_locals:
        parameters = [
            p for p in parameters if not isinstance(p, (Array, LocalObject))
        ]

    return parameters
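
For orientation, a minimal usage sketch follows. It is hedged: the import path of derive_parameters varies across Devito versions, and op.body is assumed to expose the Operator's IET.

# Sketch: derive the call arguments of a trivial Operator's IET.
# The import path below is an assumption; in some versions the function
# lives in devito.ir.iet.utils instead.
from devito import Grid, Function, Eq, Operator
from devito.ir.iet import derive_parameters

grid = Grid(shape=(4, 4))
f = Function(name='f', grid=grid)
op = Operator(Eq(f, f + 1))

# Every symbol used but not defined inside the tree becomes a parameter
params = derive_parameters(op.body, drop_locals=True)
print(sorted(p.name for p in params))
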
Example #2
def derive_parameters(nodes, drop_locals=False):
    """
    Derive all input parameters (function call arguments) from an IET
    by collecting all symbols not defined in the tree itself.
    """
    # Pick all free symbols and symbolic functions from the kernel
    functions = FindSymbols('symbolics').visit(nodes)
    free_symbols = FindSymbols('free-symbols').visit(nodes)

    # Filter out function base symbols and use real function objects
    function_names = [s.name for s in functions]
    symbols = [s for s in free_symbols if s.name not in function_names]
    symbols = functions + symbols

    defines = [s.name for s in FindSymbols('defines').visit(nodes)]
    parameters = tuple(s for s in symbols if s.name not in defines)

    # Drop globally-visible objects
    parameters = [p for p in parameters if not isinstance(p, Macro)]

    # Filter out locally-allocated Arrays and Objects
    if drop_locals:
        parameters = [
            p for p in parameters
            if not (isinstance(p, Array) and (p._mem_heap or p._mem_stack))
        ]
        parameters = [p for p in parameters if not isinstance(p, LocalObject)]

    return parameters
Example #3
    def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
        """Test construction of ElementalFunctions."""
        exprs = list(as_tuple(exprs))

        grid = Grid(shape=(10, 10))
        t = grid.stepping_dim  # noqa
        x, y = grid.dimensions  # noqa

        u = Function(name='u', grid=grid)  # noqa
        v = TimeFunction(name='v', grid=grid)  # noqa

        # List comprehension would need explicit locals/globals mappings to eval
        for i, e in enumerate(list(exprs)):
            exprs[i] = eval(e)

        op = Operator(exprs)

        # We create one ElementalFunction for each Iteration nest over space dimensions
        efuncs = []
        for n, tree in enumerate(retrieve_iteration_tree(op)):
            root = filter_iterations(tree,
                                     key=lambda i: i.dim.is_Space,
                                     stop='asap')
            efuncs.append(make_efunc('f%d' % n, root))

        assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

        for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
            # Check the `efunc` parameters
            assert all(i in efunc.parameters
                       for i in (x.symbolic_min, x.symbolic_max))
            assert all(i in efunc.parameters
                       for i in (y.symbolic_min, y.symbolic_max))
            functions = FindSymbols().visit(efunc)
            assert len(functions) == nf
            assert all(i in efunc.parameters for i in functions)
            timeiters = [
                i for i in FindSymbols('free-symbols').visit(efunc)
                if i.is_Dimension and i.is_Time
            ]
            assert len(timeiters) == nt
            assert all(i in efunc.parameters for i in timeiters)
            assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)

            # Check there's exactly one ArrayCast for each Function
            assert len(FindNodes(ArrayCast).visit(efunc)) == nf

            # Check the loop nest structure
            trees = retrieve_iteration_tree(efunc)
            assert len(trees) == 1
            tree = trees[0]
            assert all(i.dim.name == j for i, j in zip(tree, nest))

            assert efunc.make_call()
Example #4
    def _simdize(self, nodes, state):
        """
        Add compiler-specific pragmas or, if none are available, OpenMP pragmas
        to the Iteration/Expression tree to emit SIMD-friendly code.
        """
        ignore_deps = as_tuple(self._compiler_decoration('ignore-deps'))

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            for i in vector_iterations:
                handle = FindSymbols('symbolics').visit(i)
                try:
                    aligned = [
                        j for j in handle if j.is_Tensor and j.shape[-1] %
                        get_simd_items(j.dtype) == 0
                    ]
                except KeyError:
                    aligned = []
                if aligned:
                    simd = Ompizer.lang['simd-for-aligned']
                    simd = as_tuple(
                        simd(','.join([j.name for j in aligned]),
                             simdinfo[get_simd_flag()]))
                else:
                    simd = as_tuple(Ompizer.lang['simd-for'])
                mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd)

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
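
The two pragma flavours selected above typically render as below. This is a sketch built directly with cgen (the library behind the `c` aliases in these snippets); the exact strings stored in Ompizer.lang are an assumption, and 'u,v' and 32 are illustrative values.

import cgen

# Plain SIMD pragma, used when alignment cannot be guaranteed
print(cgen.Pragma('omp simd'))
# Aligned variant, emitted when every accessed tensor's innermost extent
# is a whole number of SIMD items
print(cgen.Pragma('omp simd aligned(%s:%d)' % ('u,v', 32)))
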
Example #5
def is_on_device(maybe_symbol, gpu_fit, only_writes=False):
    """
    True if all given Functions are allocated in the device memory, False otherwise.

    Parameters
    ----------
    maybe_symbol : Indexed or Function or Node
        The inspected object. May be a single Indexed or Function, or even an
        entire piece of IET.
    gpu_fit : list of Function
        The Functions which are known to definitely fit in the device memory. This
        information is given directly by the user through the compiler option
        `gpu-fit` and is propagated down here through the various stages of lowering.
    only_writes : bool, optional
        Only makes sense if `maybe_symbol` is an IET. If True, ignore all Functions
        that do not appear on the LHS of at least one Expression. Defaults to False.
    """
    try:
        functions = (maybe_symbol.function,)
    except AttributeError:
        assert maybe_symbol.is_Node
        iet = maybe_symbol
        functions = set(FindSymbols().visit(iet))
        if only_writes:
            expressions = FindNodes(Expression).visit(iet)
            functions &= {i.write for i in expressions}

    fsave = [f for f in functions if f.is_TimeFunction and is_integer(f.save)]
    if 'all-fallback' in gpu_fit and fsave:
        warning("TimeFunction %s assumed to fit the GPU memory" % fsave)
        return True

    return all(f in gpu_fit for f in fsave)
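
A small sketch of the single-Function call form, reusing is_on_device from the snippet above. A TimeFunction with an integer save only "fits" if listed in gpu_fit.

from devito import Grid, TimeFunction

grid = Grid(shape=(8, 8))
u = TimeFunction(name='u', grid=grid, save=10)

# fsave == [u] here, so membership in gpu_fit decides the outcome
assert is_on_device(u, gpu_fit=[u])
assert not is_on_device(u, gpu_fit=[])
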
Example #6
    def _simdize(self, iet):
        """
        Add pragmas to the Iteration/Expression tree to enforce SIMD auto-vectorization
        by the backend compiler.
        """
        ignore_deps = as_tuple(self._backend_compiler_pragma('ignore-deps'))

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            for i in vector_iterations:
                aligned = [
                    j for j in FindSymbols('symbolics').visit(i)
                    if j.is_DiscreteFunction
                ]
                if aligned:
                    simd = Ompizer.lang['simd-for-aligned']
                    simd = as_tuple(
                        simd(','.join([j.name for j in aligned]),
                             self.platform.simd_reg_size))
                else:
                    simd = as_tuple(Ompizer.lang['simd-for'])
                mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd)

        processed = Transformer(mapper).visit(iet)

        return processed, {}
Example #7
    def _minimize_remainders(self, iet):
        """
        Adjust ROUNDABLE Iteration bounds so as to avoid the insertion of remainder
        loops by the backend compiler.
        """
        roundable = [
            i for i in FindNodes(Iteration).visit(iet) if i.is_Roundable
        ]

        mapper = {}
        for i in roundable:
            functions = FindSymbols().visit(i)

            # Get the SIMD vector length
            dtypes = {f.dtype for f in functions if f.is_Tensor}
            assert len(dtypes) == 1
            vl = configuration['platform'].simd_items_per_reg(dtypes.pop())

            # Round up `i`'s max point so that at runtime only vector iterations
            # will be performed (i.e., remainder loops won't be necessary)
            m, M, step = i.limits
            limits = (m, M + (i.symbolic_size % vl), step)

            mapper[i] = i._rebuild(limits=limits)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
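
The bound arithmetic is easiest to see with concrete numbers; here is a pure-Python sketch of the rounding above.

# With 4 SIMD items per register and a 10-point iteration, the max bound
# grows by 10 % 4 == 2, giving 12 points: three full vector iterations
# and no scalar remainder loop
vl = 4
m, M, step = 0, 9, 1            # original limits: 10 points
size = M - m + 1                # what i.symbolic_size evaluates to here
limits = (m, M + size % vl, step)
assert limits == (0, 11, 1)
assert (limits[1] - limits[0] + 1) % vl == 0
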
Example #8
def test_find_symbols_with_duplicates():
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)

    eq = Eq(f.forward, sin(g) * f + g)

    op = Operator(eq)

    exprs = FindNodes(Expression).visit(op)

    assert len(exprs) == 2

    # Two syntactically identical indexeds r0[x, y], but they're different
    # objects because they are constructed in two different places during
    # the CIRE pass
    r0a = exprs[0].output
    r0b = exprs[1].expr.rhs.args[0].args[0]
    assert r0a.function is r0b.function
    assert r0a.name == r0b.name == "r0"
    assert hash(r0a) == hash(r0b)
    assert r0a is not r0b

    # So we expect FindSymbols to catch five Indexeds in total
    symbols = FindSymbols('indexeds').visit(op)
    assert len(symbols) == 5
Example #9
def iet_lower_dimensions(iet):
    """
    Replace all DerivedDimensions within the ``iet``'s expressions with
    lower-level symbolic objects (other Dimensions or Symbols).

        * Array indices involving SteppingDimensions are turned into ModuloDimensions.
          Example: ``u[t+1, x] = u[t, x] + 1 >>> u[t1, x] = u[t0, x] + 1``
        * Array indices involving ConditionalDimensions are turned into
          integer-division expressions.
          Example: ``u[t_sub, x] = u[time, x] >>> u[time / 4, x] = u[time, x]``
    """
    # Lower SteppingDimensions
    for i in FindNodes(Iteration).visit(iet):
        if not i.uindices:
            # Be quick: avoid uselessly reconstructing nodes
            continue
        # In an expression, there could be `u[t+1, ...]` and `v[t+1, ...]`, where
        # `u` and `v` are TimeFunction with circular time buffers (save=None) *but*
        # different modulo extent. The `t+1` indices above are therefore conceptually
        # different, so they will be replaced with the proper ModuloDimension through
        # two different calls to `xreplace`
        groups = as_mapper(i.uindices, lambda d: d.modulo)
        for k, v in groups.items():
            mapper = {d.origin: d for d in v}
            rule = lambda i: i.function.is_TimeFunction and i.function._time_size == k
            replacer = lambda i: xreplace_indices(i, mapper, rule)
            iet = XSubs(replacer=replacer).visit(iet)

    # Lower ConditionalDimensions
    cdims = [d for d in FindSymbols('free-symbols').visit(iet)
             if isinstance(d, ConditionalDimension)]
    mapper = {d: IntDiv(d.index, d.factor) for d in cdims}
    iet = XSubs(mapper).visit(iet)

    return iet
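
The grouping step leans on as_mapper bucketing the ModuloDimensions by modulo extent; below is a stand-in sketch of the assumed behaviour, with (name, modulo) pairs in place of real Dimension objects.

def as_mapper_sketch(items, key):
    # stand-in for devito.tools.as_mapper (assumed behaviour)
    mapper = {}
    for i in items:
        mapper.setdefault(key(i), []).append(i)
    return mapper

# Two buffer-2 dimensions and one buffer-3 dimension: `t+1` gets replaced
# in two separate passes, one per modulo extent
dims = [('t0', 2), ('t1', 2), ('t2', 3)]
groups = as_mapper_sketch(dims, key=lambda d: d[1])
assert sorted(groups) == [2, 3]
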
Example #10
File: advanced.py Project: skkamyab/devito
    def _ompize(self, nodes, state):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.
        """
        # Group by outer loop so that we can embed within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(nodes):
            # Determine the number of consecutive parallelizable Iterations
            key = lambda i: i.is_Parallel and\
                not (i.is_Elementizable or i.is_Vectorizable)
            candidates = filter_iterations(tree, key=key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iterations go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        # Handle parallelizable loops
        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, tree in group.items():
                # Heuristic: if at least two parallel loops are available and the
                # physical core count is greater than self.thresholds['collapse'],
                # then omp-collapse the loops
                nparallel = len(tree)
                if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                        nparallel < 2:
                    parallel = omplang['for']
                else:
                    parallel = omplang['collapse'](nparallel)

                mapper[root] = root._rebuild(pragmas=root.pragmas +
                                             (parallel, ))

                # Track the thread-private and thread-shared variables
                private.extend([
                    i for i in FindSymbols('symbolics').visit(root)
                    if i.is_Array and i._mem_stack
                ])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=omplang['par-region'](private),
                               body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
Example #11
File: advanced.py Project: nw0/devito
    def _padding(self, nodes, state):
        """
        Introduce temporary buffers padded to the nearest multiple of the vector
        length, to maximize data alignment. At the bottom of the kernel, the
        values in the padded temporaries will be copied back into the input arrays.
        """
        mapper = OrderedDict()

        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(nodes)
        if not handle:
            return nodes, {}
        shape = max([i.shape for i in handle], key=len)
        if not shape:
            return nodes, {}
        candidates = [i for i in handle if i.shape[-1] == shape[-1]]
        if not candidates:
            return nodes, {}

        # Retrieve the maximum number of items in a SIMD register when processing
        # the expressions in `nodes`
        exprs = FindNodes(Expression).visit(nodes)
        exprs = [e for e in exprs if e.write in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expected padding, for AVX512 registers)
            simd_items = simdinfo['avx512f'] / np.dtype(dtype).itemsize

        shapes = {
            k: k.shape[:-1] + (roundm(k.shape[-1], simd_items), )
            for k in candidates
        }
        mapper.update(
            OrderedDict([(k.indexed,
                          Array(name='p%s' % k.name,
                                shape=shapes[k],
                                dimensions=k.indices,
                                onstack=k._mem_stack).indexed)
                         for k in candidates]))

        # Substitute original arrays with padded buffers
        processed = SubstituteExpression(mapper).visit(nodes)

        # Build Iteration trees for initialization and copy-back of padded arrays
        mapper = OrderedDict([(k, v) for k, v in mapper.items()
                              if k.function.is_SymbolicFunction])
        init = copy_arrays(mapper, reverse=True)
        copyback = copy_arrays(mapper)

        processed = List(body=init + as_tuple(processed) + copyback)

        return processed, {}
Example #12
File: operator.py Project: speglich/devito
    def dimensions(self):
        ret = set().union(*[d._defines for d in self._dimensions])

        # During compilation other Dimensions may have been produced
        dimensions = FindSymbols('dimensions').visit(self)
        ret.update(d for d in dimensions if d.is_PerfKnob)

        ret = tuple(sorted(ret, key=attrgetter('name')))

        return ret
Example #13
    def _make_clauses(cls, **kwargs):
        kwargs['chunk_size'] = False
        clauses = super(DeviceOpenACCIteration, cls)._make_clauses(**kwargs)

        partree = kwargs['nodes']
        deviceptrs = [i.name for i in FindSymbols().visit(partree) if i.is_Array]
        if deviceptrs:
            clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

        return clauses
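
The resulting clause is just a comma-joined list of Array names; a minimal sketch with illustrative names:

# Given Arrays a and b inside the parallelized tree, the OpenACC pragma
# gains a deviceptr clause stating those pointers already live on device
deviceptrs = ['a', 'b']
clause = "deviceptr(%s)" % ",".join(deviceptrs)
assert clause == "deviceptr(a,b)"
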
Example #14
def test_minimize_remainders_due_to_autopadding():
    """
    Check that the bounds of the Iteration computing the DSE-captured aliasing
    expressions are relaxed (i.e., slightly larger) so that backend-compiler-generated
    remainder loops are avoided.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions  # noqa
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    f.data_with_halo[:] = 1.
    u = TimeFunction(name='u', grid=grid, space_order=3)
    u.data_with_halo[:] = 0.

    # Leads to 3D aliases
    eqn = Eq(
        u.forward,
        ((u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]) * 3 * f +
         (u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]) * 3 * f + 1))
    op0 = Operator(eqn, dse='noop', dle=('advanced', {'openmp': False}))
    op1 = Operator(eqn, dse='aggressive', dle=('advanced', {'openmp': False}))

    x0_blk_size = op1.parameters[5]
    y0_blk_size = op1.parameters[9]
    z_size = op1.parameters[-1]

    # Check Array shape
    arrays = [
        i for i in FindSymbols().visit(op1._func_table['bf0'].root)
        if i.is_Array
    ]
    assert len(arrays) == 1
    a = arrays[0]
    assert len(a.dimensions) == 3
    assert a.halo == ((1, 1), (1, 1), (1, 1))
    assert a.padding == ((0, 0), (0, 0), (0, 30))
    assert Add(*a.symbolic_shape[0].args) == x0_blk_size + 2
    assert Add(*a.symbolic_shape[1].args) == y0_blk_size + 2
    assert Add(*a.symbolic_shape[2].args) == z_size + 32

    # Check loop bounds
    trees = retrieve_iteration_tree(op1._func_table['bf0'].root)
    assert len(trees) == 2
    expected_rounded = trees[0].inner
    assert expected_rounded.symbolic_max ==\
        z.symbolic_max + (z.symbolic_max - z.symbolic_min + 3) % 16 + 1

    # Check numerical output
    op0(time_M=1)
    exp = np.copy(u.data[:])
    u.data_with_halo[:] = 0.
    op1(time_M=1)
    assert np.all(u.data == exp)
Example #15
def test_find_symbols_nested(mode, expected):
    grid = Grid(shape=(4, 4, 4))
    call = Call('foo', [
        Call('bar',
             [Symbol(name='x'),
              Call('baz', [Function(name='f', grid=grid)])])
    ])

    found = FindSymbols(mode).visit(call)

    assert [f.name for f in found] == eval(expected)
Example #16
    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` for thread-level parallelism.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iterations go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([
                    i for i in FindSymbols('symbolics').visit(root)
                    if i.is_Array and i._mem_stack
                ])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private),
                               body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region
        processed = Transformer(mapper).visit(iet)

        # Hack/workaround to the fact that the OpenMP pragmas are not true
        # IET nodes, so the `nthreads` variable won't be detected as a
        # Callable parameter unless inserted in a mock Expression
        if mapper:
            nt = NThreads()
            eq = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32),
                                         nt))
            return List(body=[eq, processed]), {'input': [nt]}
        else:
            return List(body=processed), {}
Example #17
def test_find_symbols():
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions

    f = Function(name='f', grid=grid)

    op = Operator(Eq(f, f[x - 1, y] + f[x + 1, y] + 1))

    symbols = FindSymbols('indexeds').visit(op)

    assert len(symbols) == 3
Example #18
File: utils.py Project: speglich/devito
def derive_parameters(iet, drop_locals=False):
    """
    Derive all input parameters (function call arguments) from an IET
    by collecting all symbols not defined in the tree itself.
    """
    # Extract all candidate parameters
    candidates = FindSymbols().visit(iet)

    # Symbols, Objects, etc., become input parameters as well
    basics = FindSymbols('basics').visit(iet)
    candidates.extend(i.function for i in basics)

    # Filter off duplicates (e.g., `x_size` is extracted by both calls to FindSymbols)
    candidates = filter_ordered(candidates)

    # Filter off symbols which are defined somewhere within `iet`
    defines = [s.name for s in FindSymbols('defines').visit(iet)]
    parameters = [s for s in candidates if s.name not in defines]

    # Drop globally-visible objects
    parameters = [
        p for p in parameters if not isinstance(p, (Global, Keyword, Macro))
    ]

    # Maybe filter out all other compiler-generated objects
    if drop_locals:
        parameters = [
            p for p in parameters if not isinstance(p, (Array, LocalObject))
        ]

    return parameters
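
The deduplication step relies on filter_ordered; a stand-in sketch of the assumed behaviour (drop duplicates, keep first-seen order):

def filter_ordered_sketch(items):
    # stand-in for devito.tools.filter_ordered (assumed behaviour)
    seen, out = set(), []
    for i in items:
        if i not in seen:
            seen.add(i)
            out.append(i)
    return out

# `x_size` is extracted by both FindSymbols calls; one copy survives
assert filter_ordered_sketch(['x_size', 'f', 'x_size']) == ['x_size', 'f']
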
Example #19
def _lower_conditional_dims(iet):
    """
    Lower ConditionalDimensions: index functions involving ConditionalDimensions
    are turned into integer-division expressions.

    Examples
    --------
    u[t_sub, x] = u[time, x]

    becomes

    u[time / 4, x] = u[time, x]
    """
    cdims = [d for d in FindSymbols('free-symbols').visit(iet)
             if isinstance(d, ConditionalDimension)]
    mapper = {d: IntDiv(d.index, d.factor) for d in cdims}
    iet = XSubs(mapper).visit(iet)

    return iet
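
A worked instance of the docstring example, with plain integers standing in for the symbolic objects:

# With factor=4, every occurrence of t_sub lowers to time / 4 under C
# integer division, which is what IntDiv models symbolically
factor, time = 4, 13
t_sub = time // factor
assert t_sub == 3
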
Example #20
def test_transformer_replace_function_body(block1, block2):
    """Create a Function and replace its body with another."""
    args = FindSymbols().visit(block1)
    f = Callable('foo', block1, 'void', args)
    assert str(f.ccode) == """void foo()
{
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      for (int k = 0; k < 7; k += 1)
      {
        a[i] = a[i] + b[i] + 5.0F;
      }
    }
  }
}"""

    f = Transformer({block1: block2}).visit(f)
    assert str(f.ccode) == """void foo()
Example #21
    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` triggering thread-level parallelism.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iterations go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in FindSymbols('symbolics').visit(root)
                                if i.is_Array and i._mem_stack])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private), body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        return Transformer(mapper).visit(iet)
Example #22
    def _make_clauses(cls, **kwargs):
        kwargs['chunk_size'] = False
        clauses = super(DeviceOpenACCIteration, cls)._make_clauses(**kwargs)

        symbols = FindSymbols().visit(kwargs['nodes'])

        deviceptrs = [i.name for i in symbols if i.is_Array and i._mem_default]
        presents = [i.name for i in symbols
                    if (i.is_AbstractFunction and
                        is_on_device(i, kwargs['gpu_fit']) and
                        i.name not in deviceptrs)]

        # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
        # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
        if presents:
            clauses.append("present(%s)" % ",".join(presents))

        if deviceptrs:
            clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

        return clauses
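
Putting the two clauses together for a kernel that reads a device-resident Function u and writes a device-default Array r0 (names illustrative):

presents, deviceptrs = ['u'], ['r0']
clauses = []
if presents:
    clauses.append("present(%s)" % ",".join(presents))
if deviceptrs:
    clauses.append("deviceptr(%s)" % ",".join(deviceptrs))
assert clauses == ["present(u)", "deviceptr(r0)"]
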
Example #23
    def make_parallel(self, iet):
        """Transform ``iet`` by introducing shared-memory parallelism."""
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the first omp-parallelizable Iteration in `tree`
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                continue
            root = candidates[0]

            # Build the `omp-for` tree
            partree = self._make_parallel_tree(root, candidates)

            # Find out the thread-private and thread-shared variables
            private = [
                i for i in FindSymbols().visit(partree)
                if i.is_Array and i._mem_stack
            ]

            # Build the `omp-parallel` region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            partree = Block(header=self.lang['par-region'](self.nthreads.name,
                                                           private),
                            body=partree)

            # Do not enter the parallel region if the step increment might be 0; this
            # would raise a `Floating point exception (core dumped)` in some OpenMP
            # implementations. Note that using an OpenMP `if` clause won't work
            if isinstance(root.step, Symbol):
                cond = Conditional(CondEq(root.step, 0),
                                   Element(c.Statement('return')))
                partree = List(body=[cond, partree])

            mapper[root] = partree
        iet = Transformer(mapper).visit(iet)

        return iet, {'input': [self.nthreads] if mapper else []}
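
A sketch of the zero-step guard built above, rendered with cgen (the `c` alias in the snippet):

import cgen

# If the step might be 0 at runtime, return before the parallel region:
# some OpenMP runtimes crash while computing chunk sizes with a zero
# increment
guard = cgen.Statement('return')
print('if (step == 0) %s' % guard)       # -> if (step == 0) return;
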
Example #24
def make_yask_kernels(iet, **kwargs):
    yk_solns = kwargs.pop('yk_solns')

    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))

        # Retrieve the section dtype
        exprs = FindNodes(Expression).visit(section)
        dtypes = {e.dtype for e in exprs}
        if len(dtypes) != 1:
            log("Unable to offload in presence of mixed-precision arithmetic")
            continue
        dtype = dtypes.pop()

        context = contexts.fetch(dimensions, dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YASKSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, dtype)
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root)
                           for i in trees})  # Drop trees

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
    iet = Transformer(mapper).visit(iet)

    if not yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to interoperate with YASK, due to the
    # different storage layout
    yk_var_objs = {
        i.name: YASKVarObject(i.name)
        for i in FindSymbols().visit(iet) if i.from_YASK
    }
    yk_var_objs.update({i: YASKVarObject(i) for i in get_local_vars(yk_solns)})
    iet = make_var_accesses(iet, yk_var_objs)

    # The signature needs to be updated
    # TODO: this could be done automagically through the iet pass engine, but
    # currently it only supports *appending* to the parameters list. While here
    # we actually need to change it as some parameters may disappear (x_m, x_M, ...)
    parameters = derive_parameters(iet, True)
    iet = iet._rebuild(parameters=parameters)

    return iet, {}
Example #25
def autotune(operator, arguments, parameters, tunable):
    """
    Acting as a higher-order function, take as input an operator and a list of
    operator arguments and perform empirical autotuning. Some of the operator
    arguments are marked as tunable.
    """
    # We get passed all the arguments, but the cfunction only requires a subset
    at_arguments = OrderedDict([(p.name, arguments[p.name])
                                for p in parameters])

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of the time-stepping dimension so that auto-tuner
    # runs will finish quickly
    steppers = [i for i in iterations if i.dim.is_Time]
    if len(steppers) == 0:
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers[0]
        start = at_arguments[stepper.dim.min_name]
        timesteps = stepper.extent(start=start,
                                   finish=options['at_squeezer']) - 1
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps
            perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
        at_arguments[stepper.dim.min_name] = start
        at_arguments[stepper.dim.max_name] = timesteps
        if stepper.dim.is_Stepping:
            at_arguments[stepper.dim.parent.min_name] = start
            at_arguments[stepper.dim.parent.max_name] = timesteps
    else:
        warning("AutoTuner: Couldn't understand loop structure; giving up")
        return arguments

    # Attempted block sizes ...
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    ]
    # ... Always try the entire iteration space (degenerate block)
    itershape = [
        mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper
    ]
    blocksizes.append(
        OrderedDict([(i, mapper[i].iteration.extent(0, j - 1))
                     for i, j in zip(mapper, itershape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration['autotuning'].level == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [
        i.symbolic_shape for i in functions if i.is_Array and i._mem_stack
    ]
    stack_space = sum(reduce(mul, i, 1)
                      for i in stack_shapes) * operator._dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                start = mapper[k].original_dim.symbolic_start.subs(arguments)
                end = mapper[k].original_dim.symbolic_end.subs(arguments)

                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            warning(
                "AutoTuner: Couldn't determine stack size; skipping block shape %s"
                % str(bs))
            continue

        # Use AutoTuner-specific profiler structs
        timer = operator.profiler.timer.reset()
        at_arguments[operator.profiler.name] = timer

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
        timings[tuple(bs.items())] = elapsed
        perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" %
             (','.join('%d' % i for i in bs.values()), elapsed, timesteps))

    try:
        best = dict(min(timings, key=timings.get))
        perf("AutoTuner: Selected block shape %s" % best)
    except ValueError:
        warning("AutoTuner: Couldn't find legal block shapes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.name in tuned
    tuned[operator.profiler.name] = operator.profiler.timer.reset()

    return tuned
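
The candidate block sizes are plain dictionaries, one per attempt, with every tunable dimension sharing the same trial value because only square blocks are tested. A sketch with illustrative names and defaults:

from collections import OrderedDict

names = ['x_block_size', 'y_block_size']      # tunable size symbols
at_blocksize = [8, 16, 32]                    # assumed basic-mode defaults
blocksizes = [OrderedDict((n, v) for n in names) for v in at_blocksize]
assert blocksizes[1] == OrderedDict([('x_block_size', 16),
                                     ('y_block_size', 16)])
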
Example #26
File: advanced.py Project: skkamyab/devito
    def _minimize_remainders(self, nodes, state):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.
        """
        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations or len(vector_iterations) > 1:
                continue
            root = vector_iterations[0]
            if root.tag is None:
                continue

            # Padding
            writes = [
                i for i in FindSymbols('symbolics-writes').visit(root)
                if i.is_Array
            ]
            padding = []
            for i in writes:
                try:
                    simd_items = get_simd_items(i.dtype)
                except KeyError:
                    # Fallback to 16 (maximum expected padding, for AVX512 registers)
                    simd_items = simdinfo['avx512f'] / np.dtype(
                        i.dtype).itemsize
                padding.append(simd_items - i.shape[-1] % simd_items)
            if len(set(padding)) == 1:
                padding = padding[0]
                for i in writes:
                    i.update(shape=i.shape[:-1] + (i.shape[-1] + padding, ))
            else:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Dynamic trip count adjustment
            endpoint = root.end_symbolic
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1]
                            for i in FindSymbols().visit(root))
            for i in root.uindices:
                for j in externals:
                    condition.append(root.end_symbolic + padding < j)
            condition = ' || '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' %
                          (condition, ccode(endpoint + padding), endpoint)))

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
Example #27
File: basic.py Project: nw0/devito
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict(zip(target, free))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Add all definitely-required arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            for i in fsymbols:
                if i in not_required:
                    continue
                elif i.is_Array:
                    args.append(
                        ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorFunction:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_Scalar:
                    args.append((i.name, i))

            # Add all maybe-required arguments that turn out to be required
            maybe_required.update(
                set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                         for i in required])

            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name, Callable(name, free, 'void', handle, ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Example #28
def opsit(trees, count):
    node_factory = OPSNodeFactory()
    expressions = []
    for tree in trees:
        expressions.extend(FindNodes(Expression).visit(tree.inner))

    it_range = []
    it_dims = 0
    for tree in trees:
        if isinstance(tree, IterationTree):
            it_range = [it.bounds() for it in tree]
            it_dims = len(tree)

    block = OPSBlock(namespace['ops_block'](count))
    block_init = Element(
        cgen.Initializer(
            block, Call("ops_decl_block",
                        [it_dims, String(block.name)], False)))

    ops_expressions = []
    accesses = defaultdict(set)

    for i in reversed(expressions):
        extend_accesses(accesses, get_accesses(i.expr))
        ops_expressions.insert(0,
                               Expression(make_ops_ast(i.expr, node_factory)))

    ops_stencils_initializers, ops_stencils = generate_ops_stencils(accesses)

    to_remove = [
        f.name for f in FindSymbols('defines').visit(List(body=expressions))
    ]

    parameters = FindSymbols('symbolics').visit(List(body=ops_expressions))
    parameters = [
        p for p in parameters
        if p.name != 'OPS_ACC_size' and p.name not in to_remove
    ]
    parameters = sorted(parameters, key=lambda i: (i.is_Constant, i.name))

    arguments = FindSymbols('symbolics').visit(List(body=expressions))
    arguments = [a for a in arguments if a.name not in to_remove]
    arguments = sorted(arguments, key=lambda i: (i.is_Constant, i.name))

    ops_expressions = [
        Expression(fix_ops_acc(e.expr, [p.name for p in parameters]))
        for e in ops_expressions
    ]

    callable_kernel = Callable(namespace['ops_kernel'](count), ops_expressions,
                               "void", parameters)

    dat_declarations = []
    argname_to_dat = {}

    for a in arguments:
        if a.is_Constant:
            continue

        dat_dec, dat_sym = to_ops_dat(a, block)
        dat_declarations.extend(dat_dec)

        argname_to_dat.update(dat_sym)

    par_loop_range_arr = SymbolicArray(name=namespace['ops_range'](count),
                                       dimensions=(len(it_range) * 2, ),
                                       dtype=np.int32)
    range_vals = []
    for mn, mx in it_range:
        range_vals.append(mn)
        range_vals.append(mx)
    par_loop_range_init = Expression(
        ClusterizedEq(Eq(par_loop_range_arr, ListInitializer(range_vals))))

    ops_args = get_ops_args([p for p in parameters], ops_stencils,
                            argname_to_dat)

    par_loop = Call("ops_par_loop", [
        FunctionPointer(callable_kernel.name),
        String(callable_kernel.name), block, it_dims, par_loop_range_arr,
        *ops_args
    ])

    return (callable_kernel,
            [par_loop_range_init, block_init] + ops_stencils_initializers +
            dat_declarations + [Call("ops_partition", [String("")])],
            List(body=[par_loop]), it_dims)
Example #29
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            defined_args = {}  # Map of argument values defined by loop bounds

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                defined_args[start.name] = bounds[0]
                defined_args[finish.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                defined_args.update(
                    {uf.name: j.start
                     for uf, j in zip(ufunc, i.uindices)})
                limits = [
                    Scalar(name=start.name, dtype=np.int32),
                    Scalar(name=finish.name, dtype=np.int32), 1
                ]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict(zip(target, free))).visit(root)

            # Insert array casts for all tensors not defined within the tree
            f_symbols = FindSymbols('symbolics').visit(free)
            defines = [s.name for s in FindSymbols('defines').visit(free)]
            casts = [
                ArrayCast(f) for f in f_symbols
                if f.is_Tensor and f.name not in defines
            ]
            free = (List(body=casts), free)

            for i in derive_parameters(free):
                if i.name in defined_args:
                    args.append((defined_args[i.name], i))
                elif i.is_Dimension:
                    d = Scalar(name=i.name, dtype=i.dtype)
                    args.append((d, d))
                else:
                    args.append((i, i))

            call, params = zip(*args)
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name,
                Callable(name, free, 'void', flatten(params), ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Example #30
File: rewriters.py Project: jrt54/devito
    def _minimize_remainders(self, iet):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.
        """
        # The innermost dimension is the one that might get padded
        p_dim = -1

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations or len(vector_iterations) > 1:
                continue
            root = vector_iterations[0]

            # Padding
            writes = [i.write for i in FindNodes(Expression).visit(root)
                      if i.write.is_Array]
            padding = []
            for i in writes:
                try:
                    simd_items = self.platform.simd_items_per_reg(i.dtype)
                except KeyError:
                    return iet, {}
                padding.append(simd_items - i.shape[-1] % simd_items)
            if len(set(padding)) == 1:
                padding = padding[0]
                for i in writes:
                    padded = (i._padding[p_dim][0], i._padding[p_dim][1] + padding)
                    i.update(padding=i._padding[:p_dim] + (padded,))
            else:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Dynamic trip count adjustment
            endpoint = root.symbolic_max
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root)
                            if i.is_Tensor)
            for i in root.uindices:
                for j in externals:
                    condition.append(root.symbolic_max + padding < j)
            condition = ' && '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func('_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' % (condition,
                                              ccode(endpoint + padding),
                                              endpoint))
            )

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        processed = Transformer(mapper).visit(iet)

        return processed, {}
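
The padding computed above, with concrete numbers (pure-Python sketch):

# With 4 SIMD items per register and an innermost extent of 10, each
# written Array gains 4 - 10 % 4 == 2 trailing points, so full vectors
# never run past the allocation
simd_items, extent = 4, 10
padding = simd_items - extent % simd_items
assert padding == 2
assert (extent + padding) % simd_items == 0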