Пример #1
0
    def place_casts(self, iet, **kwargs):
        """
        Build a new IET in which the required pointer casts have been added.

        Parameters
        ----------
        iet : Callable
            The input Iteration/Expression tree.
        **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        tuple
            The (possibly rebuilt) IET and an empty metadata dict.
        """
        # Candidates
        candidates = FindSymbols('indexeds|indexedbases').visit(iet)

        # Symbols whose underlying data object is already defined inside the
        # kernel need no cast. This happens, for example, when:
        # (i) Dereferencing a PointerArray, e.g., `float (*r0)[.] = (float(*)[.]) pr0[.]`
        # (ii) Declaring a raw pointer, e.g., `float * r0 = NULL; *malloc(&(r0), ...)`
        defined = set(FindSymbols('defines').visit(iet))

        # Unique Functions behind the candidates, sorted by name
        unique = {i.function for i in candidates}

        # Create Function -> n-dimensional array casts
        # E.g. `float (*u)[.] = (float (*)[.]) u_vec->data`
        casts = []
        for f in sorted(unique, key=lambda i: i.name):
            if f.indexed not in defined:
                casts.append(self.lang.PointerCast(f))

        # Nothing to incorporate, hand back the input IET untouched
        if not casts:
            return iet, {}

        new_body = iet.body._rebuild(casts=casts)
        return iet._rebuild(body=new_body), {}
Пример #2
0
    def place_casts(self, iet, **kwargs):
        """
        Build a new IET in which the required pointer casts have been added.

        Parameters
        ----------
        iet : Callable
            The input Iteration/Expression tree.
        **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        tuple
            The (possibly rebuilt) IET and an empty metadata dict.
        """
        candidates = FindSymbols('indexeds|indexedbases').visit(iet)
        defined = set(FindSymbols('defines').visit(iet))

        def needs_cast(f):
            # Already defined within the IET, nothing to cast
            if f in defined:
                return False
            # The _C_name represents the name of the Function among the
            # `iet.parameters`. If this differs from the name used within the
            # expressions, then it implies a cast is required
            return f._C_name != f.name

        # Unique Functions behind the candidates, sorted by name
        unique = {i.function for i in candidates}

        # Create Function -> n-dimensional array casts
        # E.g. `float (*u)[u_vec->size[1]] = (float (*)[u_vec->size[1]]) u_vec->data`
        casts = []
        for f in sorted(unique, key=lambda i: i.name):
            if needs_cast(f):
                casts.append(self.lang.PointerCast(f))

        # Nothing to incorporate, hand back the input IET untouched
        if not casts:
            return iet, {}

        new_body = iet.body._rebuild(casts=casts)
        return iet._rebuild(body=new_body), {}
Пример #3
0
    def place_casts(self, iet):
        """
        Build a new IET whose body is preceded by the required pointer casts.

        Parameters
        ----------
        iet : Callable
            The input Iteration/Expression tree.

        Returns
        -------
        tuple
            The rebuilt IET and an empty metadata dict.
        """
        symbols = FindSymbols().visit(iet)
        indexed_names = {i.name for i in FindSymbols('indexeds').visit(iet)}

        # Only tensor symbols may need a cast. To make the generated code less
        # verbose, a cast is emitted only for those that are actually indexed
        # or that are ArrayBasic objects
        need_cast = {
            s for s in symbols
            if s.is_Tensor and (s.name in indexed_names or s.is_ArrayBasic)
        }

        # The casts follow the order of the IET parameters
        casts = tuple(PointerCast(p) for p in iet.parameters if p in need_cast)

        # Group the casts (if any) into a single List followed by a blank line
        header = (List(body=casts, footer=c.Line()), ) if casts else ()

        return iet._rebuild(body=header + iet.body), {}
Пример #4
0
def test_tti_rewrite_aggressive(tti_nodse):
    """
    Run the TTI forward operator with `dse='aggressive'`, compare its output
    against the `tti_nodse` reference and inspect the generated code.
    """
    operator = tti_operator(dse='aggressive')
    rec, _, v, _ = operator.forward(kernel='centered', save=False)

    for expected, found in ((tti_nodse[0], v), (tti_nodse[1], rec)):
        assert np.allclose(expected.data, found.data, atol=10e-1)

    # Also check that DLE's loop blocking with DSE=aggressive does the right thing
    # There should be exactly two BlockDimensions; bugs in the past were generating
    # either code with no blocking (zero BlockDimensions) or code with four
    # BlockDimensions (i.e., Iteration folding was somewhat broken)
    op = operator.op_fwd(kernel='centered', save=False)
    nblocked = sum(1 for d in op.dimensions if isinstance(d, BlockDimension))
    assert nblocked == 2

    # Also, in this operator, we expect six temporary Arrays:
    # * four Arrays are allocated on the heap
    # * two Arrays are allocated on the stack and only appear within an efunc
    op_arrays = [s for s in FindSymbols().visit(op) if s.is_Array]
    assert len(op_arrays) == 4
    assert all(a._mem_heap and not a._mem_external for a in op_arrays)

    efunc_root = op._func_table['bf0'].root
    efunc_arrays = [s for s in FindSymbols().visit(efunc_root) if s.is_Array]
    assert len(efunc_arrays) == 6
    assert not any(a._mem_external for a in efunc_arrays)
    assert sum(1 for a in efunc_arrays if a._mem_heap) == 4
    assert sum(1 for a in efunc_arrays if a._mem_stack) == 2
Пример #5
0
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.

        Parameters
        ----------
        iet : Node
            The input Iteration/Expression tree.
        **kwargs
            Must contain `simd_reg_size`, which is forwarded to the
            `simd-for-aligned` pragma.

        Returns
        -------
        tuple
            The transformed IET and an empty metadata dict.
        """
        simd_reg_size = kwargs.pop('simd_reg_size')

        subs = {}
        for tree in retrieve_iteration_tree(iet):
            parallel = [i for i in tree if i.is_Parallel]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(parallel) < 2:
                continue
            innermost = parallel[-1]

            # Construct OpenMP SIMD pragma; the DiscreteFunctions accessed
            # within the Iteration are listed as aligned
            aligned = [f for f in FindSymbols('symbolics').visit(innermost)
                       if f.is_DiscreteFunction]
            if aligned:
                make_pragma = self.lang['simd-for-aligned']
                names = ','.join([f.name for f in aligned])
                simd = as_tuple(make_pragma(names, simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])

            # Rebuild with the extra pragma and the VECTORIZED property
            subs[innermost] = innermost._rebuild(
                pragmas=innermost.pragmas + simd,
                properties=[*innermost.properties, VECTORIZED]
            )

        return Transformer(subs).visit(iet), {}
Пример #6
0
    def _make_parallel(self, iet):
        """
        Replace the root of each parallelizable Iteration tree in `iet` with a
        guarded parallel region.

        Returns
        -------
        tuple
            The transformed IET and a metadata dict carrying the NThreadsMixin
            symbols found in it ('args') and the 'omp.h' header ('includes').
        """
        subs = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables,
            # then protect the parallel region in case of 0-valued step increments
            region = self._make_parregion(partree)
            subs[root] = self._make_guard(region, collapsed)

        iet = Transformer(subs).visit(iet)

        # The used `nthreads` arguments
        nthreads = [s for s in FindSymbols().visit(iet) if isinstance(s, NThreadsMixin)]

        return iet, {'args': nthreads, 'includes': ['omp.h']}
Пример #7
0
    def test_streaming_multi_input(self, opt, ntmps):
        """
        Build the same equation with a 'noop' reference Operator and with the
        parametrized `opt`, check the generated code of the latter and compare
        the results.
        """
        nt = 100
        grid = Grid(shape=(10, 10))

        orders = dict(time_order=2, space_order=2)
        u = TimeFunction(name='u', grid=grid, save=nt, **orders)
        v = TimeFunction(name='v', grid=grid, save=None, **orders)
        grad = Function(name='grad', grid=grid)
        grad1 = Function(name='grad', grid=grid)

        v.data[:] = 0.02
        for step in range(nt):
            u.data[step, :] = step + 0.1

        eqn = Eq(grad, grad - u.dt2 * v)

        op0 = Operator(eqn, opt=('noop', {'gpu-fit': u}))
        op1 = Operator(eqn, opt=opt)

        # Check generated code
        assert len(op1._func_table) == 3
        arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays) == ntmps

        # Run both Operators; `op1` writes to `grad1` rather than `grad`
        last = nt - 2
        op0.apply(time_M=last, dt=0.1)
        op1.apply(time_M=last, dt=0.1, grad=grad1)

        assert np.all(grad.data == grad1.data)
Пример #8
0
    def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):
        """
        Assemble the list of pragma clauses for a parallel loop nest.

        Parameters
        ----------
        ncollapse : int, optional
            If truthy, a `collapse` clause is emitted (and `tile` is ignored).
        reduction : optional
            If truthy, forwarded to `make_clause_reduction`.
        tile : iterable, optional
            If truthy (and `ncollapse` is not), a `tile` clause is emitted.
        **kwargs
            Must contain `nodes` (the IET nodes searched for Indexeds). `gpu_fit`
            is looked up only if at least one Indexed is found.

        Returns
        -------
        list of str
            The clauses, in the order collapse/tile, reduction, present,
            deviceptr.
        """
        clauses = []

        # `ncollapse` takes precedence over `tile`
        if ncollapse:
            clauses.append('collapse(%d)' % ncollapse)
        elif tile:
            clauses.append('tile(%s)' % ','.join(map(str, tile)))

        if reduction:
            clauses.append(make_clause_reduction(reduction))

        indexeds = FindSymbols('indexeds').visit(kwargs['nodes'])

        # Indexeds whose Function has local memory go in the `deviceptr` clause
        deviceptrs = filter_ordered(i.name for i in indexeds
                                    if i.function._mem_local)

        def is_present(indexed):
            # On the device, but not already listed among the `deviceptrs`
            return (is_on_device(indexed, kwargs['gpu_fit']) and
                    indexed.name not in deviceptrs)

        presents = filter_ordered(i.name for i in indexeds if is_present(i))

        # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
        # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
        if presents:
            clauses.append("present(%s)" % ",".join(presents))

        if deviceptrs:
            clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

        return clauses
Пример #9
0
    def make_simd(self, iet):
        """
        Create a new IET in which the innermost fully-parallel Iterations carry
        a SIMD pragma and the VECTORIZED property.

        Returns
        -------
        tuple
            The transformed IET and an empty metadata dict.
        """
        subs = {}
        for tree in retrieve_iteration_tree(iet):
            relaxed = [i for i in tree if i.is_ParallelRelaxed]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(relaxed) < 2:
                continue
            innermost = relaxed[-1]

            # Only fully-parallel Iterations will be SIMD-ized (ParallelRelaxed
            # might not be enough then)
            if not innermost.is_Parallel:
                continue

            # Add SIMD pragma; the DiscreteFunctions accessed within the
            # Iteration are listed as aligned
            aligned = [f for f in FindSymbols('symbolics').visit(innermost)
                       if f.is_DiscreteFunction]
            if aligned:
                make_pragma = self.lang['simd-for-aligned']
                names = ','.join([f.name for f in aligned])
                simd = as_tuple(make_pragma(names, self.simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])

            # Rebuild with the extra pragma and the VECTORIZED property
            subs[innermost] = innermost._rebuild(
                pragmas=innermost.pragmas + simd,
                properties=[*innermost.properties, VECTORIZED]
            )

        return Transformer(subs).visit(iet), {}
Пример #10
0
    def test_save_w_nonaffine_time(self):
        """
        Check the code generated when a subsampled TimeFunction is written at
        a time index shifted by a Constant.
        """
        factor = 4
        grid = Grid(shape=(11, 11))
        x, y = grid.dimensions
        t = grid.stepping_dim
        time = grid.time_dim

        time_subsampled = ConditionalDimension('t_sub', parent=time, factor=factor)

        f = Function(name='f', grid=grid, dtype=np.int32)
        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=2,
                             time_dim=time_subsampled)

        save_shift = Constant(name='save_shift', dtype=np.int32)

        # `u` is read through indirect accesses, `usave` is written at a
        # shifted subsampled time index
        update = Eq(u.forward, u[t, f[x, x], f[y, y]] + 1.)
        shifted = usave.subs(time_subsampled, time_subsampled - save_shift)
        save = Eq(shifted, u)

        op = Operator([update, save], opt=('buffering', 'tasking', 'orchestrate'))

        # We just check the generated code here
        locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
        assert len(locks) == 1
        assert len(op._func_table) == 2
Пример #11
0
    def test_unread_buffered_function(self):
        """
        Compare a 'noop' Operator against a 'buffering' one when the saved
        TimeFunction `u` is only written by the equations.
        """
        nt = 10
        grid = Grid(shape=(4, 4))
        time = grid.time_dim

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid)
        v1 = TimeFunction(name='v', grid=grid)

        eqns = [Eq(v.forward, v + 1, implicit_dims=time),
                Eq(u, v)]

        op0 = Operator(eqns, opt='noop')
        op1 = Operator(eqns, opt='buffering')

        # Check generated code
        trees = retrieve_iteration_tree(op1)
        assert len(trees) == 1
        buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(buffers) == 1

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, v=v1)

        for ref, out in ((u, u1), (v, v1)):
            assert np.all(ref.data == out.data)
Пример #12
0
    def test_async_degree(self, async_degree):
        """
        Check that the `buf-async-degree` option determines the size of the
        first dimension of the generated buffer.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)

        eqn = Eq(u.forward, u + 1)

        options = {'buf-async-degree': async_degree}
        op0 = Operator(eqn, opt='noop')
        op1 = Operator(eqn, opt=('buffering', options))

        # Check generated code
        trees = retrieve_iteration_tree(op1)
        assert len(trees) == 2
        buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(buffers) == 1
        assert buffers[-1].symbolic_shape[0] == async_degree

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1)

        assert np.all(u.data == u1.data)
Пример #13
0
    def test_hoisting_if_coupled(self):
        """
        Test that coupled aliases are successfully hoisted out of the time loop.
        """
        grid = Grid((10, 10))

        a = Function(name="a", grid=grid, space_order=4)
        b = Function(name="b", grid=grid, space_order=4)

        e = TimeFunction(name="e", grid=grid, space_order=4)
        f = TimeFunction(name="f", grid=grid, space_order=4)

        # Two sub-expressions, the second of which embeds the first
        subexpr0 = sqrt(1. + 1. / a)
        subexpr1 = 1 / (8. * subexpr0 - 8. / b)

        update_e = Eq(e.forward, e + 1)
        update_f = Eq(f.forward, f * subexpr0 - f * subexpr1 + e.forward.dx)

        op = Operator([update_e, update_f])

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 3

        # The first tree is expected to use two heap-allocated Arrays
        first_root = trees[0].root
        arrays = [s for s in FindSymbols().visit(first_root) if s.is_Array]
        assert len(arrays) == 2
        assert all(a._mem_heap and not a._mem_external for a in arrays)
Пример #14
0
def test_read_only():
    """
    Compare a 'noop' Operator against a 'buffering' one when the saved
    TimeFunction `u` is only read, at `backward`, current and `forward`.
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    for step in range(nt):
        u.data[step, :] = step

    rhs = v + u.backward + u + u.forward + 1.
    eqns = [Eq(v.forward, rhs)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    trees = retrieve_iteration_tree(op1)
    assert len(trees) == 2
    buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(buffers) == 1

    last = nt - 2
    op0.apply(time_M=last)
    op1.apply(time_M=last, v=v1)

    assert np.all(v.data == v1.data)
Пример #15
0
def test_over_injection():
    """
    Compare a 'noop' Operator against a 'buffering' one when the saved
    TimeFunction `u` is also the target of a sparse injection and the source
    of a sparse interpolation.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=nt)
    rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=nt)
    u = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt)
    u1 = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt)

    src.data[:] = 1.

    eqns = [Eq(u.forward, u + 1)]
    eqns = eqns + src.inject(field=u.forward, expr=src)
    eqns = eqns + rec.interpolate(expr=u.forward)

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code; one extra tree is expected when the language
    # is not 'C'
    ntrees = len(retrieve_iteration_tree(op1))
    assert ntrees == 5 + bool(configuration['language'] != 'C')
    buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(buffers) == 1

    last = nt - 2
    op0.apply(time_M=last)
    op1.apply(time_M=last, u=u1)

    assert np.all(u.data == u1.data)
Пример #16
0
def test_two_heterogeneous_buffers():
    """
    Compare a 'noop' Operator against a 'buffering' one when two saved
    TimeFunctions, accessed at different time offsets, are involved.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid, save=nt)
    u1 = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    v1 = TimeFunction(name='v', grid=grid, save=nt)

    # Same initial data for `u` and its twin `u1`
    for step in range(nt):
        for func in (u, u1):
            func.data[step, :] = step

    eqns = [Eq(u.forward, u + v + 1),
            Eq(v.forward, u + v + v.backward)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    trees = retrieve_iteration_tree(op1)
    assert len(trees) == 3
    buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(buffers) == 2

    last = nt - 2
    op0.apply(time_M=last)
    op1.apply(time_M=last, u=u1, v=v1)

    for ref, out in ((u, u1), (v, v1)):
        assert np.all(ref.data == out.data)
Пример #17
0
def test_read_only_backwards_unstructured():
    """
    Instead of the classic `time-1`, `time`, and `time+1`, here we access the
    buffered Function via `time-2`, `time-1` and `time+2`.
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    for step in range(nt):
        u.data[step, :] = step

    rhs = v + u.backward.backward + u.backward + u.forward.forward + 1.
    eqns = [Eq(v.backward, rhs)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    trees = retrieve_iteration_tree(op1)
    assert len(trees) == 2
    buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(buffers) == 1

    op0.apply(time_m=2)
    op1.apply(time_m=2, v=v1)

    assert np.all(v.data == v1.data)
Пример #18
0
    def test_tasking_unfused_two_locks(self):
        """
        Check the code generated with `opt=('tasking', 'fuse', 'orchestrate')`
        when two saved TimeFunctions, `u` and `v`, are written from two
        distinct temporaries, then run the Operator and check the last saved
        time slice of both.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp0 = Function(name='tmp0', grid=grid)
        tmp1 = Function(name='tmp1', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        w = TimeFunction(name='w', grid=grid)

        # `u` and `v` are written from `tmp0` and `tmp1` respectively, both of
        # which are copies of `w.forward`
        eqns = [
            Eq(w.forward, w + 1),
            Eq(tmp0, w.forward),
            Eq(tmp1, w.forward),
            Eq(u.forward, tmp0, subdomain=bundle0),
            Eq(v.forward, tmp1, subdomain=bundle0)
        ]

        op = Operator(eqns, opt=('tasking', 'fuse', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 7
        # Two Locks are expected, `lock0` and `lock1` in the strings below
        assert len([i for i in FindSymbols().visit(op)
                    if isinstance(i, Lock)]) == 2
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 4
        assert (str(sections[1].body[0].body[0].body[0].body[0]) ==
                'while(lock0[0] == 0 || lock1[0] == 0);')  # Wait-lock
        # Third Section: the statements involving `lock0` and `sdata0`
        body = sections[2].body[0].body[0]
        assert (str(body.body[1].condition) ==
                'Ne(lock0[0], 2) | Ne(FieldFromComposite(sdata0[wi0]), 1)'
                )  # Wait-thread
        assert (str(body.body[1].body[0]) == 'wi0 = (wi0 + 1)%(npthreads0);')
        assert str(body.body[2]) == 'sdata0[wi0].time = time;'
        assert str(body.body[3]) == 'lock0[0] = 0;'  # Set-lock
        assert str(body.body[4]) == 'sdata0[wi0].flag = 2;'
        # Fourth Section: same structure, but for `lock1` and `sdata1`
        body = sections[3].body[0].body[0]
        assert (str(body.body[1].condition) ==
                'Ne(lock1[0], 2) | Ne(FieldFromComposite(sdata1[wi1]), 1)'
                )  # Wait-thread
        assert (str(body.body[1].body[0]) == 'wi1 = (wi1 + 1)%(npthreads1);')
        assert str(body.body[2]) == 'sdata1[wi1].time = time;'
        assert str(body.body[3]) == 'lock1[0] = 0;'  # Set-lock
        assert str(body.body[4]) == 'sdata1[wi1].flag = 2;'
        assert len(op._func_table) == 4
        # Inspect the Expressions of the two `copy_device_to_host*` callables:
        # each resets its own lock to 1 and then writes `u` or `v`
        exprs = FindNodes(Expression).visit(
            op._func_table['copy_device_to_host0'].root)
        assert len(exprs) == 18
        assert str(exprs[14]) == 'lock0[0] = 1;'
        assert exprs[15].write is u
        exprs = FindNodes(Expression).visit(
            op._func_table['copy_device_to_host1'].root)
        assert str(exprs[14]) == 'lock1[0] = 1;'
        assert exprs[15].write is v

        op.apply(time_M=nt - 2)

        # The last saved time slice of both `u` and `v` must hold 9 everywhere
        assert np.all(u.data[nt - 1] == 9)
        assert np.all(v.data[nt - 1] == 9)
Пример #19
0
    def test_composite_full(self):
        """
        Compare a 'noop' Operator against one built with 'buffering',
        'tasking', 'streaming' and 'orchestrate' all enabled.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)
        v1 = TimeFunction(name='v', grid=grid, save=nt)

        eqns = [Eq(u.forward, u + v + 1),
                Eq(v.forward, u + v + v.backward)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': (u, v)}))
        op1 = Operator(eqns, opt=('buffering', 'tasking', 'streaming', 'orchestrate'))

        # Check generated code
        trees = retrieve_iteration_tree(op1)
        assert len(trees) == 9
        locks = [s for s in FindSymbols().visit(op1) if isinstance(s, Lock)]
        assert len(locks) == 2

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, v=v1)

        for ref, out in ((u, u1), (v, v1)):
            assert np.all(ref.data == out.data)
Пример #20
0
    def test_subdimensions(self):
        """
        Compare a 'noop' Operator against a 'buffering' one when the equation
        is defined over `middle` SubDimensions rather than the grid Dimensions.
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))
        x, y, z = grid.dimensions

        # All SubDimensions share the same thickness on both sides
        thickness = dict(thickness_left=2, thickness_right=2)
        xi = SubDimension.middle(name='xi', parent=x, **thickness)
        yi = SubDimension.middle(name='yi', parent=y, **thickness)
        zi = SubDimension.middle(name='zi', parent=z, **thickness)

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)

        eqn = Eq(u.forward, u + 1).xreplace({x: xi, y: yi, z: zi})

        op0 = Operator(eqn, opt='noop')
        op1 = Operator(eqn, opt='buffering')

        # Check generated code
        trees = retrieve_iteration_tree(op1)
        assert len(trees) == 2
        buffers = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(buffers) == 1

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1)

        assert np.all(u.data == u1.data)
Пример #21
0
    def test_composite_streaming_tasking(self):
        """
        Compare a 'noop' Operator against one built with 'tasking',
        'streaming' and 'orchestrate', where one saved TimeFunction is read
        (`fsave`) and another one is written (`usave`).
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))

        u = TimeFunction(name='u', grid=grid)
        u1 = TimeFunction(name='u', grid=grid)
        fsave = TimeFunction(name='fsave', grid=grid, save=nt)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        usave1 = TimeFunction(name='usave', grid=grid, save=nt)

        for step in range(nt):
            fsave.data[step, :] = step

        eqns = [Eq(u.forward, u + fsave + 1), Eq(usave, u)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': (fsave, usave)}))
        op1 = Operator(eqns, opt=('tasking', 'streaming', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op0)) == 1
        assert len(retrieve_iteration_tree(op1)) == 4
        symbols = FindSymbols().visit(op1)
        locks = [s for s in symbols if isinstance(s, Lock)]
        assert len(locks) == 1
        threads = [s for s in symbols if isinstance(s, PThreadArray)]
        assert len(threads) == 2
        first, second = threads
        assert first.size == 1
        assert second.size.data == 2

        last = nt - 1
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, usave=usave1)

        for ref, out in ((u, u1), (usave, usave1)):
            assert np.all(ref.data == out.data)
Пример #22
0
    def test_tasking_over_compiler_generated(self):
        """
        Compare a 'cire-sops' reference Operator against one that also enables
        'tasking' and 'orchestrate', and check the code generated by the latter.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

        u = TimeFunction(name='u', grid=grid, space_order=2)
        u1 = TimeFunction(name='u', grid=grid, space_order=2)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        usave1 = TimeFunction(name='usave', grid=grid, save=nt)

        update = Eq(u.forward, u.dx.dx*0.042 + 1)
        save = Eq(usave, u, subdomain=bundle0)
        eqns = [update, save]

        op0 = Operator(eqns, opt=('cire-sops', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=('cire-sops', 'tasking', 'orchestrate'))

        # Check generated code
        trees = retrieve_iteration_tree(op1)
        assert len(trees) == 5
        locks = [s for s in FindSymbols().visit(op1) if isinstance(s, Lock)]
        assert len(locks) == 1
        sections = FindNodes(Section).visit(op1)
        assert len(sections) == 3
        wait_lock = str(sections[1].body[0].body[0].body[0])
        assert 'while(lock0[t' in wait_lock

        last = nt-1
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, usave=usave1)

        for ref, out in ((u, u1), (usave, usave1)):
            assert np.all(ref.data == out.data)
Пример #23
0
    def _make_parregion(self, partree, parrays):
        """
        Wrap `partree` within a parallel Region.

        Parameters
        ----------
        partree : ParallelTree
            The tree to be wrapped.
        parrays : dict
            Mapper from Arrays to PointerArrays; updated in place with an entry
            for each thread-private heap Array not yet in it.
        """
        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_globals = []
        for a in FindSymbols().visit(partree):
            if not (a.is_Array and a._mem_heap and a._mem_local):
                continue
            if a not in parrays:
                parrays[a] = PointerArray(name=self.sregistry.make_name(),
                                          dimensions=(self.threadid, ),
                                          array=a)
            heap_globals.append(HeapGlobal(a, parrays[a]))

        # No thread-private heap Arrays, `partree` is wrapped as is
        if not heap_globals:
            return self.Region(partree)

        # Initialize the thread ID ahead of the HeapGlobals, which in turn
        # precede the existing `partree` prefix
        tid = c.Value(self.threadid._C_typedata, self.threadid.name)
        init = c.Initializer(tid, self.lang['thread-num'])
        prefix = List(header=init,
                      body=heap_globals + list(partree.prefix),
                      footer=c.Line())

        return self.Region(partree._rebuild(prefix=prefix))
Пример #24
0
    def test_streaming_postponed_deletion(self, opt, ntmps):
        """
        Build the same equations with a 'noop' reference Operator and with the
        parametrized `opt`, check the generated code of the latter and compare
        the results.
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))

        u = TimeFunction(name='u', grid=grid)
        v = TimeFunction(name='v', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid)
        v1 = TimeFunction(name='v', grid=grid)

        for step in range(nt):
            usave.data[step, :] = step

        # `usave` is read by both equations
        eqns = [Eq(u.forward, u + usave),
                Eq(v.forward, v + u.forward.dx + usave)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=opt)

        # Check generated code
        assert len(op1._func_table) == 3
        arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays) == ntmps

        last = nt - 1
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, v=v1)

        for ref, out in ((u, u1), (v, v1)):
            assert np.all(ref.data == out.data)
Пример #25
0
    def test_composite_buffering_tasking(self):
        """
        Compare a 'noop' Operator against one built with 'buffering',
        'tasking' and 'orchestrate', and check the code generated by both.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

        u = TimeFunction(name='u', grid=grid, time_order=2)
        u1 = TimeFunction(name='u', grid=grid, time_order=2)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        usave1 = TimeFunction(name='usave', grid=grid, save=nt)

        update = Eq(u.forward, u*1.1 + 1)
        save = Eq(usave, u.dt2, subdomain=bundle0)
        eqns = [update, save]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=('buffering', 'tasking', 'orchestrate'))

        # Check generated code -- thanks to buffering only expect 1 lock!
        assert len(retrieve_iteration_tree(op0)) == 2
        assert len(retrieve_iteration_tree(op1)) == 5
        symbols = FindSymbols().visit(op1)
        locks = [s for s in symbols if isinstance(s, Lock)]
        assert len(locks) == 1
        threads = [s for s in symbols if isinstance(s, PThreadArray)]
        assert len(threads) == 1
        assert threads[0].size.data == 1

        last = nt-1
        op0.apply(time_M=last, dt=0.1)
        op1.apply(time_M=last, dt=0.1, u=u1, usave=usave1)

        for ref, out in ((u, u1), (usave, usave1)):
            assert np.all(ref.data == out.data)
Пример #26
0
    def test_nested(self):
        """
        Check that nested aliases are optimized away through "smaller" aliases.

        Examples
        --------
        Given the expression

            sin(cos(a[x, y]))

        We should get

            t0 = cos(a[x,y])
            t1 = sin(t0)
            out = t1  # pseudocode
        """
        grid = Grid(shape=(3, 3))
        x, y = grid.dimensions  # noqa

        u = TimeFunction(name='u', grid=grid)
        g = Function(name='g', grid=grid)

        # The same nested expression appears twice, the second time shifted
        # by one point along both `x` and `y`
        rhs = u + sin(cos(g)) + sin(cos(g[x+1, y+1]))
        op = Operator(Eq(u.forward, rhs))

        # We expect two temporary Arrays: `r1 = cos(g)` and `r2 = sin(r1)`
        temporaries = [s for s in FindSymbols().visit(op) if s.is_Array]
        assert len(temporaries) == 2
        assert all(a._mem_heap and not a._mem_external for a in temporaries)
Пример #27
0
    def test_tasking_in_isolation(self):
        """
        Check the code generated with `opt=('tasking', 'orchestrate')` when a
        single saved TimeFunction is written, then run the Operator and check
        the last saved time slice.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp = Function(name='tmp', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid)

        eqns = [
            Eq(tmp, v),
            Eq(v.forward, v + 1),
            Eq(u.forward, tmp, subdomain=bundle0)
        ]

        op = Operator(eqns, opt=('tasking', 'orchestrate'))

        # Check generated code
        trees = retrieve_iteration_tree(op)
        assert len(trees) == 5
        locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
        assert len(locks) == 1
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 3

        # First Section: wait on the lock
        wait_lock = sections[0].body[0].body[0].body[0].body[0]
        assert str(wait_lock) == 'while(lock0[0] == 0);'

        # Third Section: the statements involving `lock0` and `sdata0`
        stmts = sections[2].body[0].body[0].body
        assert (str(stmts[1].condition) ==
                'Ne(lock0[0], 2) | Ne(FieldFromComposite(sdata0[wi0]), 1)')
        assert str(stmts[2]) == 'sdata0[wi0].time = time;'
        assert str(stmts[3]) == 'lock0[0] = 0;'
        assert str(stmts[4]) == 'sdata0[wi0].flag = 2;'

        op.apply(time_M=nt-2)

        assert np.all(u.data[nt-1] == 8)
Пример #28
0
    def test_catch_duplicate_from_different_clusters(self):
        """
        Check that the compiler is able to detect redundant aliases when these
        stem from different Clusters.
        """
        grid = Grid((10, 10))

        a = Function(name="a", grid=grid, space_order=4)
        b = Function(name="b", grid=grid, space_order=4)
        c = Function(name="c", grid=grid, space_order=4)
        d = Function(name="d", grid=grid, space_order=4)

        s = SparseTimeFunction(name="s", grid=grid, npoint=1, nt=2)
        e = TimeFunction(name="e", grid=grid, space_order=4)
        f = TimeFunction(name="f", grid=grid, space_order=4)

        # Note that `sqrt((d - 2*c)/a)` appears in both `deriv` and `deriv2`
        deriv = (sqrt((a - 2*b)/c) * e.dx).dy + (sqrt((d - 2*c)/a) * e.dy).dx
        deriv2 = (sqrt((c - 2*b)/c) * f.dy).dx + (sqrt((d - 2*c)/a) * f.dx).dy

        # The injection sits in between the two stencil updates
        eqns = [Eq(e.forward, deriv + e)]
        eqns = eqns + s.inject(e.forward, expr=s)
        eqns = eqns + [Eq(f.forward, deriv2 + f + e.forward.dx)]

        op = Operator(eqns)

        temporaries = [i for i in FindSymbols().visit(op) if i.is_Array]
        assert len(temporaries) == 3
        assert all(i._mem_heap and not i._mem_external for i in temporaries)
Пример #29
0
    def _make_parregion(self, partree, parrays):
        """
        Wrap `partree` within an OpenMPRegion.

        Parameters
        ----------
        partree : ParallelTree
            The tree to be wrapped.
        parrays : dict
            Mapper from Arrays to PointerArrays; updated in place with an entry
            for each thread-private heap Array not yet in it.
        """
        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_globals = []
        for a in FindSymbols().visit(partree):
            if not (a.is_Array and a._mem_heap and a._mem_local):
                continue
            if a not in parrays:
                parrays[a] = PointerArray(name=self.sregistry.make_name(),
                                          dimensions=(self.threadid, ),
                                          array=a)
            heap_globals.append(Dereference(a, parrays[a]))

        # With no thread-private heap Arrays, `partree` itself is the body;
        # otherwise it is preceded by the thread ID and the Dereferences
        body = partree
        if heap_globals:
            body = List(header=self._make_tid(self.threadid),
                        body=heap_globals + [partree],
                        footer=c.Line())

        return OpenMPRegion(body, partree.nthreads)
Пример #30
0
    def _make_guard(self, parregion, *args):
        """
        Protect `parregion` with a Conditional if it contains at least one
        ParallelTree rooted in a DeviceIteration; otherwise, defer to the
        parent class implementation.
        """
        partrees = FindNodes(ParallelTree).visit(parregion)
        on_device = any(isinstance(i.root, self.DeviceIteration) for i in partrees)
        if not on_device:
            return super()._make_guard(parregion, *args)

        conditions = []

        # There must be at least one iteration or potential crash
        if not parregion.is_Affine:
            first_tree = retrieve_iteration_tree(parregion.root)[0]
            collapsed = first_tree[:parregion.ncollapsed]
            conditions.extend(i.symbolic_size > 0 for i in collapsed)

        # SparseFunctions may occasionally degenerate to zero-size arrays. In such
        # a case, a copy-in produces a `nil` pointer on the device. To fire up a
        # parallel loop we must ensure none of the SparseFunction pointers are `nil`
        for f in FindSymbols().visit(parregion):
            if not f.is_SparseFunction:
                continue
            size = prod(f._C_get_field(FULL, d).size for d in f.dimensions)
            conditions.append(size > 0)

        # Drop dynamically evaluated conditions (e.g. because the `symbolic_size`
        # is an integer value rather than a symbol). This avoids ugly and
        # unnecessary conditionals such as `if (true) { ...}`
        conditions = [i for i in conditions if i != true]

        # Nothing left to check, no guard required
        if not conditions:
            return parregion

        # Combine all conditions into a single guard
        return List(body=[Conditional(And(*conditions), parregion)])