def place_casts(self, iet, **kwargs):
    """
    Build a new IET in which the required pointer casts have been placed.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    **kwargs
        Accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        The (possibly rebuilt) IET and an empty dict.
    """
    accessed = FindSymbols('indexeds|indexedbases').visit(iet)

    # Objects already defined inside the kernel do not need a cast. This is
    # the case, for example, when:
    # (i) Dereferencing a PointerArray, e.g., `float (*r0)[.] = (float(*)[.]) pr0[.]`
    # (ii) Declaring a raw pointer, e.g., `float * r0 = NULL; *malloc(&(r0), ...)
    defined = set(FindSymbols('defines').visit(iet))

    # One cast per distinct Function, emitted in name order, e.g.
    # `float (*u)[.] = (float (*)[.]) u_vec->data`
    unique_functions = {s.function for s in accessed}
    casts = []
    for fn in sorted(unique_functions, key=lambda fn: fn.name):
        if fn.indexed in defined:
            continue
        casts.append(self.lang.PointerCast(fn))

    # Nothing to add: hand back the input untouched
    if not casts:
        return iet, {}

    new_body = iet.body._rebuild(casts=casts)
    return iet._rebuild(body=new_body), {}
def place_casts(self, iet, **kwargs):
    """
    Build a new IET in which the required pointer casts have been placed.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    **kwargs
        Accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        The (possibly rebuilt) IET and an empty dict.
    """
    accessed = FindSymbols('indexeds|indexedbases').visit(iet)
    defined = set(FindSymbols('defines').visit(iet))

    # One cast per distinct Function, emitted in name order, e.g.
    # `float (*u)[u_vec->size[1]] = (float (*)[u_vec->size[1]]) u_vec->data`
    unique_functions = {s.function for s in accessed}
    casts = []
    for fn in sorted(unique_functions, key=lambda fn: fn.name):
        if fn in defined:
            continue
        # The `_C_name` is the name of the Function among the
        # `iet.parameters`. A cast is required only when it differs from the
        # name used within the expressions
        if fn._C_name != fn.name:
            casts.append(self.lang.PointerCast(fn))

    # Nothing to add: hand back the input untouched
    if not casts:
        return iet, {}

    new_body = iet.body._rebuild(casts=casts)
    return iet._rebuild(body=new_body), {}
def place_casts(self, iet):
    """
    Build a new IET in which the required pointer casts have been placed.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.

    Returns
    -------
    tuple
        The (possibly rebuilt) IET and an empty dict.
    """
    symbols = FindSymbols().visit(iet)
    indexed_names = {s.name for s in FindSymbols('indexeds').visit(iet)}

    # Only tensor symbols are candidates; to keep the generated code terse,
    # a candidate is retained only if it is actually indexed or if it is an
    # ArrayBasic
    need_cast = {
        s for s in symbols
        if s.is_Tensor and (s.name in indexed_names or s.is_ArrayBasic)
    }

    # The casts follow the order of the IET parameters
    casts = tuple(PointerCast(p) for p in iet.parameters if p in need_cast)
    if not casts:
        return iet, {}

    # Group the casts and separate them from the rest of the body
    header = (List(body=casts, footer=c.Line()), )
    return iet._rebuild(body=header + iet.body), {}
def test_tti_rewrite_aggressive(tti_nodse):
    """
    Run the TTI forward operator with ``dse='aggressive'``, compare its output
    with the ``tti_nodse`` reference, and inspect the generated code (number
    of BlockDimensions and of temporary Arrays).
    """
    operator = tti_operator(dse='aggressive')
    rec, _, v, _ = operator.forward(kernel='centered', save=False)

    # Numerical check against the reference
    assert np.allclose(tti_nodse[0].data, v.data, atol=10e-1)
    assert np.allclose(tti_nodse[1].data, rec.data, atol=10e-1)

    # Loop blocking combined with DSE=aggressive must yield exactly two
    # BlockDimensions. Past bugs produced either no blocking at all (zero
    # BlockDimensions) or four BlockDimensions (broken Iteration folding)
    op = operator.op_fwd(kernel='centered', save=False)
    nblocks = sum(1 for d in op.dimensions if isinstance(d, BlockDimension))
    assert nblocks == 2

    # Temporary Arrays visible from the Operator: four, all on the heap
    op_arrays = [s for s in FindSymbols().visit(op) if s.is_Array]
    assert len(op_arrays) == 4
    assert all(a._mem_heap and not a._mem_external for a in op_arrays)

    # Temporary Arrays visible from the `bf0` efunc: six in total, of which
    # four on the heap and two on the stack
    efunc_root = op._func_table['bf0'].root
    efunc_arrays = [s for s in FindSymbols().visit(efunc_root) if s.is_Array]
    assert len(efunc_arrays) == 6
    assert all(not a._mem_external for a in efunc_arrays)
    assert sum(1 for a in efunc_arrays if a._mem_heap) == 4
    assert sum(1 for a in efunc_arrays if a._mem_stack) == 2
def make_simd(self, iet, **kwargs):
    """
    Build a new IET in which suitable Iterations carry a SIMD pragma.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    **kwargs
        Must contain ``simd_reg_size``, which is forwarded to the
        ``'simd-for-aligned'`` pragma factory.

    Returns
    -------
    tuple
        The transformed IET and an empty dict.
    """
    simd_reg_size = kwargs.pop('simd_reg_size')

    replacements = {}
    for tree in retrieve_iteration_tree(iet):
        parallel = [it for it in tree if it.is_Parallel]

        # Vectorize the innermost PARALLEL Iteration, but only when there is
        # also an outer level of parallelism
        if len(parallel) < 2:
            continue
        innermost = parallel[-1]

        # Pick the pragma: the "aligned" flavour if any DiscreteFunction is
        # accessed within the Iteration, the plain one otherwise
        names = [s.name for s in FindSymbols('symbolics').visit(innermost)
                 if s.is_DiscreteFunction]
        if names:
            make_pragma = self.lang['simd-for-aligned']
            simd = as_tuple(make_pragma(','.join(names), simd_reg_size))
        else:
            simd = as_tuple(self.lang['simd-for'])

        # Attach the pragma and tag the Iteration as VECTORIZED
        replacements[innermost] = innermost._rebuild(
            pragmas=innermost.pragmas + simd,
            properties=[*innermost.properties, VECTORIZED]
        )

    return Transformer(replacements).visit(iet), {}
def _make_parallel(self, iet):
    """
    Replace the parallelizable Iteration trees in `iet` with parallel regions.

    Returns
    -------
    tuple
        The transformed IET and a dict carrying the `nthreads`-like arguments
        found in it (``'args'``) and the required headers (``'includes'``).
    """
    replacements = OrderedDict()
    for tree in retrieve_iteration_tree(iet):
        # The Iterations in `tree` selected by `self.key`
        candidates = filter_iterations(tree, key=self.key)
        if not candidates:
            continue

        # Outer parallelism
        root, partree, collapsed = self._make_partree(candidates)

        # Nested parallelism, then reductions, then atomicize and optimize
        # single-thread prodders
        partree = self._make_threaded_prodders(
            self._make_reductions(self._make_nested_partree(partree), collapsed)
        )

        # Wrap within a parallel region, then protect the region in case of
        # 0-valued step increments
        replacements[root] = self._make_guard(
            self._make_parregion(partree), collapsed
        )

    iet = Transformer(replacements).visit(iet)

    # The `nthreads` arguments actually used by the transformed IET
    nthreads_args = []
    for symbol in FindSymbols().visit(iet):
        if isinstance(symbol, NThreadsMixin):
            nthreads_args.append(symbol)

    return iet, {'args': nthreads_args, 'includes': ['omp.h']}
def test_streaming_multi_input(self, opt, ntmps):
    """
    Compare an Operator built with `opt` against a 'noop' reference on an
    equation reading a saved TimeFunction at multiple time points, and check
    the number of efuncs and temporary Arrays in the generated code.
    """
    nt = 100
    grid = Grid(shape=(10, 10))

    u = TimeFunction(name='u', grid=grid, save=nt, time_order=2, space_order=2)
    v = TimeFunction(name='v', grid=grid, save=None, time_order=2, space_order=2)
    grad = Function(name='grad', grid=grid)
    grad_opt = Function(name='grad', grid=grid)

    # Input data
    v.data[:] = 0.02
    for step in range(nt):
        u.data[step, :] = step + 0.1

    eqn = Eq(grad, grad - u.dt2 * v)

    reference = Operator(eqn, opt=('noop', {'gpu-fit': u}))
    optimized = Operator(eqn, opt=opt)

    # Check generated code
    assert len(optimized._func_table) == 3
    narrays = sum(1 for s in FindSymbols().visit(optimized) if s.is_Array)
    assert narrays == ntmps

    # Both Operators must compute the same `grad`
    run_args = {'time_M': nt - 2, 'dt': 0.1}
    reference.apply(**run_args)
    optimized.apply(**run_args, grad=grad_opt)

    assert np.all(grad.data == grad_opt.data)
def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):
    """
    Assemble the list of clauses (as strings) for a parallel-loop pragma.

    Parameters
    ----------
    ncollapse : int, optional
        If truthy, a ``collapse(...)`` clause is emitted.
    reduction : optional
        If truthy, it is turned into a clause via `make_clause_reduction`.
    tile : iterable, optional
        If truthy and `ncollapse` is not, a ``tile(...)`` clause is emitted.
    **kwargs
        ``nodes`` is searched for Indexeds; ``gpu_fit`` is forwarded to
        `is_on_device`.

    Returns
    -------
    list of str
        The clauses, in emission order.
    """
    clauses = []

    # `collapse` and `tile` are mutually exclusive; `collapse` wins
    if ncollapse:
        clauses.append('collapse(%d)' % ncollapse)
    elif tile:
        tile_sizes = ','.join(map(str, tile))
        clauses.append('tile(%s)' % tile_sizes)

    if reduction:
        clauses.append(make_clause_reduction(reduction))

    accesses = FindSymbols('indexeds').visit(kwargs['nodes'])

    # Names whose underlying function is `_mem_local` go in `deviceptr`
    deviceptrs = filter_ordered(
        a.name for a in accesses if a.function._mem_local
    )

    # Any other name that `is_on_device` goes in `present`
    presents = filter_ordered(
        a.name for a in accesses
        if is_on_device(a, kwargs['gpu_fit']) and a.name not in deviceptrs
    )

    # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement
    # for indirectly indexed arrays (e.g., a[b[i]]) unless a present clause
    # is used
    if presents:
        clauses.append("present(%s)" % ",".join(presents))

    if deviceptrs:
        clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

    return clauses
def make_simd(self, iet):
    """
    Build a new IET in which suitable Iterations carry a SIMD pragma.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.

    Returns
    -------
    tuple
        The transformed IET and an empty dict.
    """
    replacements = {}
    for tree in retrieve_iteration_tree(iet):
        relaxed = [it for it in tree if it.is_ParallelRelaxed]

        # Vectorize the innermost PARALLEL Iteration, but only when there is
        # also an outer level of parallelism
        if len(relaxed) < 2:
            continue
        innermost = relaxed[-1]

        # ParallelRelaxed alone might not be enough: only fully-parallel
        # Iterations get SIMD-ized
        if not innermost.is_Parallel:
            continue

        # Pick the pragma: the "aligned" flavour if any DiscreteFunction is
        # accessed within the Iteration, the plain one otherwise
        names = [s.name for s in FindSymbols('symbolics').visit(innermost)
                 if s.is_DiscreteFunction]
        if names:
            make_pragma = self.lang['simd-for-aligned']
            simd = as_tuple(make_pragma(','.join(names), self.simd_reg_size))
        else:
            simd = as_tuple(self.lang['simd-for'])

        # Attach the pragma and tag the Iteration as VECTORIZED
        replacements[innermost] = innermost._rebuild(
            pragmas=innermost.pragmas + simd,
            properties=[*innermost.properties, VECTORIZED]
        )

    return Transformer(replacements).visit(iet), {}
def test_save_w_nonaffine_time(self):
    """
    Build an Operator saving `u` into `usave` through a shifted, subsampled
    time dimension, and check the generated code only (one Lock, two efuncs).
    """
    factor = 4

    grid = Grid(shape=(11, 11))
    x, y = grid.dimensions
    t = grid.stepping_dim
    time = grid.time_dim

    t_sub = ConditionalDimension('t_sub', parent=time, factor=factor)

    f = Function(name='f', grid=grid, dtype=np.int32)
    u = TimeFunction(name='u', grid=grid)
    usave = TimeFunction(name='usave', grid=grid, save=2, time_dim=t_sub)

    save_shift = Constant(name='save_shift', dtype=np.int32)

    # `u` is read through indirect accesses; `usave` is written at a
    # shifted subsampled time index
    update = Eq(u.forward, u[t, f[x, x], f[y, y]] + 1.)
    save = Eq(usave.subs(t_sub, t_sub - save_shift), u)

    op = Operator([update, save], opt=('buffering', 'tasking', 'orchestrate'))

    # The Operator is not run: only the generated code is inspected
    nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
    assert nlocks == 1
    assert len(op._func_table) == 2
def test_unread_buffered_function(self):
    """
    Compare a 'buffering' Operator against a 'noop' reference when the saved
    TimeFunction `u` is written but never read.
    """
    nt = 10
    grid = Grid(shape=(4, 4))
    time = grid.time_dim

    u = TimeFunction(name='u', grid=grid, save=nt)
    u_buf = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v_buf = TimeFunction(name='v', grid=grid)

    eqns = [
        Eq(v.forward, v + 1, implicit_dims=time),
        Eq(u, v),
    ]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code: a single Iteration tree and a single buffer
    assert len(retrieve_iteration_tree(buffered)) == 1
    nbuffers = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert nbuffers == 1

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, u=u_buf, v=v_buf)

    assert np.all(u.data == u_buf.data)
    assert np.all(v.data == v_buf.data)
def test_async_degree(self, async_degree):
    """
    Check that the 'buf-async-degree' option determines the size of the
    buffer along its first dimension, and that results match a 'noop' run.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid, save=nt)
    u_buf = TimeFunction(name='u', grid=grid, save=nt)

    eqn = Eq(u.forward, u + 1)

    reference = Operator(eqn, opt='noop')
    options = {'buf-async-degree': async_degree}
    buffered = Operator(eqn, opt=('buffering', options))

    # Check generated code: two Iteration trees and one buffer whose leading
    # dimension has size `async_degree`
    assert len(retrieve_iteration_tree(buffered)) == 2
    buffers = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(buffers) == 1
    (buffer,) = buffers
    assert buffer.symbolic_shape[0] == async_degree

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, u=u_buf)

    assert np.all(u.data == u_buf.data)
def test_hoisting_if_coupled(self):
    """
    Check that coupled aliases get hoisted out of the time loop: the first of
    the three Iteration trees must contain two heap, non-external Arrays.
    """
    grid = Grid((10, 10))

    a = Function(name="a", grid=grid, space_order=4)
    b = Function(name="b", grid=grid, space_order=4)

    e = TimeFunction(name="e", grid=grid, space_order=4)
    f = TimeFunction(name="f", grid=grid, space_order=4)

    # The second sub-expression embeds the first one
    subexpr0 = sqrt(1. + 1. / a)
    subexpr1 = 1 / (8. * subexpr0 - 8. / b)

    op = Operator([
        Eq(e.forward, e + 1),
        Eq(f.forward, f * subexpr0 - f * subexpr1 + e.forward.dx),
    ])

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 3

    hoisted = [s for s in FindSymbols().visit(trees[0].root) if s.is_Array]
    assert len(hoisted) == 2
    for array in hoisted:
        assert array._mem_heap and not array._mem_external
def test_read_only():
    """
    Compare a 'buffering' Operator against a 'noop' reference when the saved
    TimeFunction `u` is only read (at `backward`, current and `forward`).
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v_buf = TimeFunction(name='v', grid=grid)

    # The i-th time slice of `u` is filled with the value i
    for step in range(nt):
        u.data[step, :] = step

    eqns = [Eq(v.forward, v + u.backward + u + u.forward + 1.)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code: two Iteration trees and a single buffer
    assert len(retrieve_iteration_tree(buffered)) == 2
    nbuffers = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert nbuffers == 1

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, v=v_buf)

    assert np.all(v.data == v_buf.data)
def test_over_injection():
    """
    Compare a 'buffering' Operator against a 'noop' reference when the saved
    TimeFunction `u` is also the target of a sparse injection and the source
    of a sparse interpolation.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=nt)
    rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=nt)
    u = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt)
    u_buf = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt)

    src.data[:] = 1.

    eqns = [Eq(u.forward, u + 1)]
    eqns = eqns + src.inject(field=u.forward, expr=src)
    eqns = eqns + rec.interpolate(expr=u.forward)

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code: one extra Iteration tree is expected when the
    # configured language is not 'C'
    ntrees = len(retrieve_iteration_tree(buffered))
    assert ntrees == 5 + bool(configuration['language'] != 'C')
    nbuffers = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert nbuffers == 1

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, u=u_buf)

    assert np.all(u.data == u_buf.data)
def test_two_heterogeneous_buffers():
    """
    Compare a 'buffering' Operator against a 'noop' reference when two saved
    TimeFunctions, accessed with different time stencils, are both buffered.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid, save=nt)
    u_buf = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    v_buf = TimeFunction(name='v', grid=grid, save=nt)

    # Same initial data for the reference `u` and its buffered counterpart
    for step in range(nt):
        u.data[step, :] = step
        u_buf.data[step, :] = step

    eqns = [
        Eq(u.forward, u + v + 1),
        Eq(v.forward, u + v + v.backward),
    ]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code: three Iteration trees and two buffers
    assert len(retrieve_iteration_tree(buffered)) == 3
    nbuffers = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert nbuffers == 2

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, u=u_buf, v=v_buf)

    assert np.all(u.data == u_buf.data)
    assert np.all(v.data == v_buf.data)
def test_read_only_backwards_unstructured():
    """
    Instead of the classic `time-1`, `time`, and `time+1`, here the buffered
    Function is accessed via `time-2`, `time-1` and `time+2`, while `v` is
    written at `backward`.
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v_buf = TimeFunction(name='v', grid=grid)

    # The i-th time slice of `u` is filled with the value i
    for step in range(nt):
        u.data[step, :] = step

    rhs = v + u.backward.backward + u.backward + u.forward.forward + 1.
    eqns = [Eq(v.backward, rhs)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code: two Iteration trees and a single buffer
    assert len(retrieve_iteration_tree(buffered)) == 2
    nbuffers = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert nbuffers == 1

    reference.apply(time_m=2)
    buffered.apply(time_m=2, v=v_buf)

    assert np.all(v.data == v_buf.data)
def test_tasking_unfused_two_locks(self):
    """
    Two saved TimeFunctions (`u` and `v`) are written from two distinct
    temporaries; check that the generated code uses two Locks with the
    expected wait/set protocol, then run the Operator and check the output.
    """
    nt = 10
    bundle0 = Bundle()
    grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

    tmp0 = Function(name='tmp0', grid=grid)
    tmp1 = Function(name='tmp1', grid=grid)
    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    w = TimeFunction(name='w', grid=grid)

    eqns = [
        Eq(w.forward, w + 1),
        Eq(tmp0, w.forward),
        Eq(tmp1, w.forward),
        Eq(u.forward, tmp0, subdomain=bundle0),
        Eq(v.forward, tmp1, subdomain=bundle0),
    ]

    op = Operator(eqns, opt=('tasking', 'fuse', 'orchestrate'))

    # Check generated code
    assert len(retrieve_iteration_tree(op)) == 7
    nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
    assert nlocks == 2
    sections = FindNodes(Section).visit(op)
    assert len(sections) == 4

    # Wait-lock
    wait_lock = sections[1].body[0].body[0].body[0].body[0]
    assert str(wait_lock) == 'while(lock0[0] == 0 || lock1[0] == 0);'

    # Activation of the first task
    task0 = sections[2].body[0].body[0]
    # Wait-thread
    assert (str(task0.body[1].condition) ==
            'Ne(lock0[0], 2) | Ne(FieldFromComposite(sdata0[wi0]), 1)')
    assert str(task0.body[1].body[0]) == 'wi0 = (wi0 + 1)%(npthreads0);'
    assert str(task0.body[2]) == 'sdata0[wi0].time = time;'
    # Set-lock
    assert str(task0.body[3]) == 'lock0[0] = 0;'
    assert str(task0.body[4]) == 'sdata0[wi0].flag = 2;'

    # Activation of the second task
    task1 = sections[3].body[0].body[0]
    # Wait-thread
    assert (str(task1.body[1].condition) ==
            'Ne(lock1[0], 2) | Ne(FieldFromComposite(sdata1[wi1]), 1)')
    assert str(task1.body[1].body[0]) == 'wi1 = (wi1 + 1)%(npthreads1);'
    assert str(task1.body[2]) == 'sdata1[wi1].time = time;'
    # Set-lock
    assert str(task1.body[3]) == 'lock1[0] = 0;'
    assert str(task1.body[4]) == 'sdata1[wi1].flag = 2;'

    # The two copy routines
    assert len(op._func_table) == 4

    copy0 = op._func_table['copy_device_to_host0'].root
    exprs0 = FindNodes(Expression).visit(copy0)
    assert len(exprs0) == 18
    assert str(exprs0[14]) == 'lock0[0] = 1;'
    assert exprs0[15].write is u

    copy1 = op._func_table['copy_device_to_host1'].root
    exprs1 = FindNodes(Expression).visit(copy1)
    assert str(exprs1[14]) == 'lock1[0] = 1;'
    assert exprs1[15].write is v

    # Run and check the last saved time slice of both Functions
    op.apply(time_M=nt - 2)

    last = nt - 1
    assert np.all(u.data[last] == 9)
    assert np.all(v.data[last] == 9)
def test_composite_full(self):
    """
    Compare an Operator built with 'buffering', 'tasking', 'streaming' and
    'orchestrate' against a 'noop' reference on two coupled saved
    TimeFunctions.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    u_opt = TimeFunction(name='u', grid=grid, save=nt)
    v_opt = TimeFunction(name='v', grid=grid, save=nt)

    eqns = [
        Eq(u.forward, u + v + 1),
        Eq(v.forward, u + v + v.backward),
    ]

    reference = Operator(eqns, opt=('noop', {'gpu-fit': (u, v)}))
    optimized = Operator(
        eqns, opt=('buffering', 'tasking', 'streaming', 'orchestrate')
    )

    # Check generated code: nine Iteration trees and two Locks
    assert len(retrieve_iteration_tree(optimized)) == 9
    nlocks = sum(
        1 for s in FindSymbols().visit(optimized) if isinstance(s, Lock)
    )
    assert nlocks == 2

    reference.apply(time_M=nt - 2)
    optimized.apply(time_M=nt - 2, u=u_opt, v=v_opt)

    assert np.all(u.data == u_opt.data)
    assert np.all(v.data == v_opt.data)
def test_subdimensions(self):
    """
    Compare a 'buffering' Operator against a 'noop' reference when the
    equation iterates over "middle" SubDimensions of the grid dimensions.
    """
    nt = 10
    grid = Grid(shape=(10, 10, 10))
    x, y, z = grid.dimensions

    # One middle SubDimension per grid dimension, two points off each side
    xi = SubDimension.middle(name='xi', parent=x,
                             thickness_left=2, thickness_right=2)
    yi = SubDimension.middle(name='yi', parent=y,
                             thickness_left=2, thickness_right=2)
    zi = SubDimension.middle(name='zi', parent=z,
                             thickness_left=2, thickness_right=2)

    u = TimeFunction(name='u', grid=grid, save=nt)
    u_buf = TimeFunction(name='u', grid=grid, save=nt)

    substitutions = {x: xi, y: yi, z: zi}
    eqn = Eq(u.forward, u + 1).xreplace(substitutions)

    reference = Operator(eqn, opt='noop')
    buffered = Operator(eqn, opt='buffering')

    # Check generated code: two Iteration trees and a single Array
    assert len(retrieve_iteration_tree(buffered)) == 2
    narrays = sum(1 for s in FindSymbols().visit(buffered) if s.is_Array)
    assert narrays == 1

    reference.apply(time_M=nt - 2)
    buffered.apply(time_M=nt - 2, u=u_buf)

    assert np.all(u.data == u_buf.data)
def test_composite_streaming_tasking(self):
    """
    Compare an Operator built with 'tasking', 'streaming' and 'orchestrate'
    against a 'noop' reference when one saved TimeFunction is read (`fsave`)
    and another one is written (`usave`).
    """
    nt = 10
    grid = Grid(shape=(10, 10, 10))

    u = TimeFunction(name='u', grid=grid)
    u_opt = TimeFunction(name='u', grid=grid)
    fsave = TimeFunction(name='fsave', grid=grid, save=nt)
    usave = TimeFunction(name='usave', grid=grid, save=nt)
    usave_opt = TimeFunction(name='usave', grid=grid, save=nt)

    # The i-th time slice of `fsave` is filled with the value i
    for step in range(nt):
        fsave.data[step, :] = step

    eqns = [
        Eq(u.forward, u + fsave + 1),
        Eq(usave, u),
    ]

    reference = Operator(eqns, opt=('noop', {'gpu-fit': (fsave, usave)}))
    optimized = Operator(eqns, opt=('tasking', 'streaming', 'orchestrate'))

    # Check generated code
    assert len(retrieve_iteration_tree(reference)) == 1
    assert len(retrieve_iteration_tree(optimized)) == 4

    symbols = FindSymbols().visit(optimized)
    nlocks = sum(1 for s in symbols if isinstance(s, Lock))
    assert nlocks == 1

    threads = [s for s in symbols if isinstance(s, PThreadArray)]
    assert len(threads) == 2
    first, second = threads[0], threads[1]
    assert first.size == 1
    assert second.size.data == 2

    reference.apply(time_M=nt-1)
    optimized.apply(time_M=nt-1, u=u_opt, usave=usave_opt)

    assert np.all(u.data == u_opt.data)
    assert np.all(usave.data == usave_opt.data)
def test_tasking_over_compiler_generated(self):
    """
    Compare a 'cire-sops' + 'tasking' + 'orchestrate' Operator against a
    'cire-sops'-only reference, and inspect the generated Lock and Sections.
    """
    nt = 10
    bundle0 = Bundle()
    grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

    u = TimeFunction(name='u', grid=grid, space_order=2)
    u_opt = TimeFunction(name='u', grid=grid, space_order=2)
    usave = TimeFunction(name='usave', grid=grid, save=nt)
    usave_opt = TimeFunction(name='usave', grid=grid, save=nt)

    eqns = [
        Eq(u.forward, u.dx.dx*0.042 + 1),
        Eq(usave, u, subdomain=bundle0),
    ]

    reference = Operator(eqns, opt=('cire-sops', {'gpu-fit': usave}))
    optimized = Operator(eqns, opt=('cire-sops', 'tasking', 'orchestrate'))

    # Check generated code
    assert len(retrieve_iteration_tree(optimized)) == 5
    nlocks = sum(
        1 for s in FindSymbols().visit(optimized) if isinstance(s, Lock)
    )
    assert nlocks == 1

    sections = FindNodes(Section).visit(optimized)
    assert len(sections) == 3
    wait_lock = str(sections[1].body[0].body[0].body[0])
    assert 'while(lock0[t' in wait_lock

    reference.apply(time_M=nt-1)
    optimized.apply(time_M=nt-1, u=u_opt, usave=usave_opt)

    assert np.all(u.data == u_opt.data)
    assert np.all(usave.data == usave_opt.data)
def _make_parregion(self, partree, parrays):
    """
    Wrap `partree` within a parallel region (``self.Region``).

    Parameters
    ----------
    partree
        The parallel tree to be wrapped.
    parrays : dict-like
        Maps Arrays to their PointerArray counterpart; entries are reused
        when present and added here when missing.

    Returns
    -------
    The object built by ``self.Region`` around the (possibly rebuilt)
    `partree`.
    """
    # Detect thread-private arrays on the heap; these get "mapped" to shared
    # vector-expanded (one entry per thread) Arrays
    heap_private = [
        s for s in FindSymbols().visit(partree)
        if s.is_Array and s._mem_heap and s._mem_local
    ]

    heap_globals = []
    for array in heap_private:
        if array not in parrays:
            # Create the PointerArray (and draw a fresh name from the
            # registry) only the first time `array` is encountered
            parrays.setdefault(
                array,
                PointerArray(name=self.sregistry.make_name(),
                             dimensions=(self.threadid, ), array=array)
            )
        heap_globals.append(HeapGlobal(array, parrays[array]))

    # No thread-private heap arrays: `partree` is wrapped as is
    if not heap_globals:
        return self.Region(partree)

    # Initialize the thread ID and prepend the HeapGlobals to the prefix
    tid = c.Value(self.threadid._C_typedata, self.threadid.name)
    init = c.Initializer(tid, self.lang['thread-num'])
    prefix = List(header=init,
                  body=heap_globals + list(partree.prefix),
                  footer=c.Line())

    return self.Region(partree._rebuild(prefix=prefix))
def test_streaming_postponed_deletion(self, opt, ntmps):
    """
    Compare an Operator built with `opt` against a 'noop' reference when the
    saved TimeFunction `usave` is read by two consecutive equations, and
    check the number of efuncs and temporary Arrays in the generated code.
    """
    nt = 10
    grid = Grid(shape=(10, 10, 10))

    u = TimeFunction(name='u', grid=grid)
    v = TimeFunction(name='v', grid=grid)
    usave = TimeFunction(name='usave', grid=grid, save=nt)
    u_opt = TimeFunction(name='u', grid=grid)
    v_opt = TimeFunction(name='v', grid=grid)

    # The i-th time slice of `usave` is filled with the value i
    for step in range(nt):
        usave.data[step, :] = step

    eqns = [
        Eq(u.forward, u + usave),
        Eq(v.forward, v + u.forward.dx + usave),
    ]

    reference = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
    optimized = Operator(eqns, opt=opt)

    # Check generated code
    assert len(optimized._func_table) == 3
    narrays = sum(1 for s in FindSymbols().visit(optimized) if s.is_Array)
    assert narrays == ntmps

    reference.apply(time_M=nt - 1)
    optimized.apply(time_M=nt - 1, u=u_opt, v=v_opt)

    assert np.all(u.data == u_opt.data)
    assert np.all(v.data == v_opt.data)
def test_composite_buffering_tasking(self):
    """
    Compare a 'buffering' + 'tasking' + 'orchestrate' Operator against a
    'noop' reference when `usave` stores the second time derivative of `u`.
    """
    nt = 10
    bundle0 = Bundle()
    grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

    u = TimeFunction(name='u', grid=grid, time_order=2)
    u_opt = TimeFunction(name='u', grid=grid, time_order=2)
    usave = TimeFunction(name='usave', grid=grid, save=nt)
    usave_opt = TimeFunction(name='usave', grid=grid, save=nt)

    eqns = [
        Eq(u.forward, u*1.1 + 1),
        Eq(usave, u.dt2, subdomain=bundle0),
    ]

    reference = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
    optimized = Operator(eqns, opt=('buffering', 'tasking', 'orchestrate'))

    # Check generated code -- thanks to buffering only expect 1 lock!
    assert len(retrieve_iteration_tree(reference)) == 2
    assert len(retrieve_iteration_tree(optimized)) == 5

    symbols = FindSymbols().visit(optimized)
    nlocks = sum(1 for s in symbols if isinstance(s, Lock))
    assert nlocks == 1

    threads = [s for s in symbols if isinstance(s, PThreadArray)]
    assert len(threads) == 1
    assert threads[0].size.data == 1

    reference.apply(time_M=nt-1, dt=0.1)
    optimized.apply(time_M=nt-1, dt=0.1, u=u_opt, usave=usave_opt)

    assert np.all(u.data == u_opt.data)
    assert np.all(usave.data == usave_opt.data)
def test_nested(self):
    """
    Check that nested aliases are optimized away through "smaller" aliases.

    Examples
    --------
    Given the expression

        sin(cos(g[x, y]))

    the expected outcome is, in pseudocode

        t0 = cos(g[x, y])
        t1 = sin(t0)
        out = t1
    """
    grid = Grid(shape=(3, 3))
    x, y = grid.dimensions  # noqa

    u = TimeFunction(name='u', grid=grid)
    g = Function(name='g', grid=grid)

    # The same nested expression appears twice, the second time shifted by
    # one point along both dimensions
    nested = sin(cos(g))
    nested_shifted = sin(cos(g[x+1, y+1]))
    op = Operator(Eq(u.forward, u + nested + nested_shifted))

    # Exactly two temporary Arrays are expected (presumably one for the
    # inner `cos` and one for the `sin` wrapping it), both on the heap
    temporaries = [s for s in FindSymbols().visit(op) if s.is_Array]
    assert len(temporaries) == 2
    for array in temporaries:
        assert array._mem_heap and not array._mem_external
def test_tasking_in_isolation(self):
    """
    Build an Operator with 'tasking' and 'orchestrate' only, check the
    generated Lock protocol, then run it and check the last saved time slice.
    """
    nt = 10
    bundle0 = Bundle()
    grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

    tmp = Function(name='tmp', grid=grid)
    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)

    eqns = [
        Eq(tmp, v),
        Eq(v.forward, v + 1),
        Eq(u.forward, tmp, subdomain=bundle0),
    ]

    op = Operator(eqns, opt=('tasking', 'orchestrate'))

    # Check generated code
    assert len(retrieve_iteration_tree(op)) == 5
    nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
    assert nlocks == 1

    sections = FindNodes(Section).visit(op)
    assert len(sections) == 3

    # Wait on the lock in the first Section
    wait_lock = sections[0].body[0].body[0].body[0].body[0]
    assert str(wait_lock) == 'while(lock0[0] == 0);'

    # Task activation in the last Section
    task = sections[2].body[0].body[0]
    assert (str(task.body[1].condition) ==
            'Ne(lock0[0], 2) | Ne(FieldFromComposite(sdata0[wi0]), 1)')
    assert str(task.body[2]) == 'sdata0[wi0].time = time;'
    assert str(task.body[3]) == 'lock0[0] = 0;'
    assert str(task.body[4]) == 'sdata0[wi0].flag = 2;'

    op.apply(time_M=nt-2)

    last = nt-1
    assert np.all(u.data[last] == 8)
def test_catch_duplicate_from_different_clusters(self):
    """
    Check that redundant aliases are detected even when they stem from
    different Clusters: the Operator must contain exactly three heap,
    non-external Arrays.
    """
    grid = Grid((10, 10))

    a = Function(name="a", grid=grid, space_order=4)
    b = Function(name="b", grid=grid, space_order=4)
    c = Function(name="c", grid=grid, space_order=4)
    d = Function(name="d", grid=grid, space_order=4)

    s = SparseTimeFunction(name="s", grid=grid, npoint=1, nt=2)

    e = TimeFunction(name="e", grid=grid, space_order=4)
    f = TimeFunction(name="f", grid=grid, space_order=4)

    # `sqrt((d - 2*c)/a)` is shared by the two derivative expressions
    deriv = (sqrt((a - 2*b)/c) * e.dx).dy + (sqrt((d - 2*c)/a) * e.dy).dx
    deriv2 = (sqrt((c - 2*b)/c) * f.dy).dx + (sqrt((d - 2*c)/a) * f.dx).dy

    # The sparse injection sits between the two stencil updates
    first_update = [Eq(e.forward, deriv + e)]
    injection = s.inject(e.forward, expr=s)
    second_update = [Eq(f.forward, deriv2 + f + e.forward.dx)]

    op = Operator(first_update + injection + second_update)

    temporaries = [sym for sym in FindSymbols().visit(op) if sym.is_Array]
    assert len(temporaries) == 3
    for array in temporaries:
        assert array._mem_heap and not array._mem_external
def _make_parregion(self, partree, parrays):
    """
    Wrap `partree` within an OpenMPRegion.

    Parameters
    ----------
    partree
        The parallel tree to be wrapped.
    parrays : dict-like
        Maps Arrays to their PointerArray counterpart; entries are reused
        when present and added here when missing.

    Returns
    -------
    OpenMPRegion
        The region, built with ``partree.nthreads``, enclosing either
        `partree` alone or a List that first dereferences the thread-private
        heap arrays.
    """
    # Detect thread-private arrays on the heap; these get "mapped" to shared
    # vector-expanded (one entry per thread) Arrays
    heap_private = [
        s for s in FindSymbols().visit(partree)
        if s.is_Array and s._mem_heap and s._mem_local
    ]

    derefs = []
    for array in heap_private:
        if array not in parrays:
            # Create the PointerArray (and draw a fresh name from the
            # registry) only the first time `array` is encountered
            parrays.setdefault(
                array,
                PointerArray(name=self.sregistry.make_name(),
                             dimensions=(self.threadid, ), array=array)
            )
        derefs.append(Dereference(array, parrays[array]))

    # No thread-private heap arrays: `partree` is wrapped as is
    if not derefs:
        return OpenMPRegion(partree, partree.nthreads)

    # Thread ID first, then the Dereferences, then the tree itself
    body = List(header=self._make_tid(self.threadid),
                body=derefs + [partree],
                footer=c.Line())

    return OpenMPRegion(body, partree.nthreads)
def _make_guard(self, parregion, *args):
    """
    Protect `parregion` with a Conditional, if necessary.

    If none of the ParallelTrees in `parregion` is rooted in a
    ``self.DeviceIteration``, the call is delegated to the parent class.

    Parameters
    ----------
    parregion
        The parallel region to be guarded.
    *args
        Forwarded untouched to the parent class implementation.

    Returns
    -------
    Either `parregion` itself, or a List wrapping a Conditional around it.
    """
    partrees = FindNodes(ParallelTree).visit(parregion)
    on_device = any(isinstance(p.root, self.DeviceIteration) for p in partrees)
    if not on_device:
        return super()._make_guard(parregion, *args)

    guards = []

    # There must be at least one iteration or potential crash
    if not parregion.is_Affine:
        outermost = retrieve_iteration_tree(parregion.root)[0]
        for iteration in outermost[:parregion.ncollapsed]:
            guards.append(iteration.symbolic_size > 0)

    # SparseFunctions may occasionally degenerate to zero-size arrays. In
    # such a case, a copy-in produces a `nil` pointer on the device. To fire
    # up a parallel loop we must ensure none of the SparseFunction pointers
    # are `nil`
    sparse = [s for s in FindSymbols().visit(parregion) if s.is_SparseFunction]
    sizes = [prod(f._C_get_field(FULL, d).size for d in f.dimensions)
             for f in sparse]
    guards.extend(size > 0 for size in sizes)

    # Drop dynamically evaluated conditions (e.g. because the `symbolic_size`
    # is an integer value rather than a symbol). This avoids ugly and
    # unnecessary conditionals such as `if (true) { ...}`
    guards = [g for g in guards if g != true]

    # Nothing left to check: no Conditional needed
    if not guards:
        return parregion

    # Combine all guards into a single condition
    return List(body=[Conditional(And(*guards), parregion)])