def _make_compute(self, hs, key, msgs, callpoke):
    """
    Build the Callable named ``compute<key>`` out of the body of the
    HaloSpot ``hs``.

    Every ExpressionBundle found in ``hs.body`` is replaced by a List made
    of ``callpoke`` followed by the bundle itself. Return None if the body
    of ``hs`` is already a Call.
    """
    if hs.body.is_Call:
        return None

    bundles = FindNodes(ExpressionBundle).visit(hs.body)
    # Place `callpoke` right before each bundle
    mapper = {bundle: List(body=[callpoke, bundle]) for bundle in bundles}
    body = Transformer(mapper).visit(hs.body)

    return make_efunc('compute%d' % key, body, hs.arguments)
def _build_casts(self, iet):
    """
    Prepend array and pointer casts to the Iteration/Expression tree ``iet``.

    An ArrayCast is created for each tensor in ``self.input`` flagged as
    ``_mem_external``; a PointerCast for the profiler's Timer comes last.
    """
    casts = []
    for f in self.input:
        if f.is_Tensor and f._mem_external:
            casts.append(ArrayCast(f))
    casts.append(PointerCast(Timer(self.profiler)))

    return List(body=casts + [iet])
def iet_insert_decls(iet, external=None):
    """
    Transform the input IET inserting the necessary symbol declarations.
    Declarations are placed as close as possible to the first symbol occurrence.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    external : tuple, optional
        The symbols defined in some outer Callable, which therefore must not
        be re-defined. Defaults to None.
        NOTE(review): assumes ``as_tuple(None)`` yields an empty tuple --
        confirm against the helper's definition.

    Returns
    -------
    The transformed tree. If any heap allocation was scheduled, the tree is
    wrapped in a List whose header carries declarations and allocations and
    whose footer carries the frees.
    """
    iet = as_tuple(iet)
    # Normalize once, instead of at every membership test in the loop below
    external = as_tuple(external)

    # Classify and then schedule declarations to stack/heap
    allocator = Allocator()
    for k, v in MapExprStmts().visit(iet).items():
        if k.is_Expression:
            if k.is_definition:
                # On the stack
                site = v if v else iet
                allocator.push_scalar_on_stack(site[-1], k)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.arguments
        else:
            # Neither an Expression nor a Call: there is nothing to declare.
            # Without this branch `objs` would be unbound (first iteration)
            # or silently carry over the objects of the previous node.
            continue

        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v if v else iet
                    allocator.push_object_on_stack(site[-1], i)
                elif i.is_Array:
                    if i in external:
                        # The Array is defined in some other IET
                        continue
                    elif i._mem_stack:
                        # On the stack
                        allocator.push_object_on_stack(iet[0], i)
                    else:
                        # On the heap
                        allocator.push_array_on_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    mapper = dict(allocator.onstack)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def _make_sendrecv(self, f, hse, key, **kwargs):
    """
    Build the static Callable ``sendrecv_<key>`` for the function ``f``.

    The generated body posts an ``MPI_Irecv``, gathers into ``bufg``, posts
    an ``MPI_Isend``, waits on both requests, and finally scatters from
    ``bufs``. Gather and scatter are both guarded against ``MPI_PROC_NULL``.
    """
    comm = f.grid.distributor._obj_comm

    # One buffer Dimension for each Dimension of `f` not in `hse.loc_indices`
    buf_dims = []
    for d in f.dimensions:
        if d not in hse.loc_indices:
            buf_dims.append(Dimension(name='buf_%s' % d.root))

    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather_args = [bufg] + list(bufg.shape) + [f] + ofsg
    scatter_args = [bufs] + list(bufs.shape) + [f] + ofss

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')),
                         Call('gather_%s' % key, gather_args))
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')),
                          Call('scatter_%s' % key, scatter_args))

    count = reduce(mul, bufs.shape, 1)
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    recv_args = [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                 fromrank, Integer(13), comm, rrecv]
    send_args = [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                 torank, Integer(13), comm, rsend]
    recv = Call('MPI_Irecv', recv_args)
    send = Call('MPI_Isend', send_args)

    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])

    parameters = [f] + list(bufs.shape) + ofsg + ofss + [fromrank, torank, comm]
    return Callable('sendrecv_%s' % key, iet, 'void', parameters, ('static',))
def _avoid_denormals(self, nodes, state):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.

    Return the new tree together with the headers that must be included.
    """
    includes = ('xmmintrin.h', 'pmmintrin.h')
    flushed = List(body=(Denormals(), nodes))
    return flushed, {'includes': includes}
def _avoid_denormals(self, iet):
    """
    Wrap ``iet`` in a List whose header carries the statements that switch
    on the denormals-are-zero and flush-to-zero modes.

    Return the new tree together with the headers that must be included.
    """
    comment = cgen.Comment('Flush denormal numbers to zero in hardware')
    daz = cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)')
    ftz = cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')

    wrapped = List(header=[comment, daz, ftz], body=iet)
    return wrapped, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
def _(iet):
    """
    Prepend to ``iet`` the code selecting the default OpenMP device.

    If the DeviceID differs from -1, it is used as is. Otherwise, when an
    MPICommObject is among the parameters of ``iet``, the device is chosen
    as ``rank % ngpus``. Return the rebuilt tree and the DeviceID as extra
    argument.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = next((p for p in iet.parameters if isinstance(p, MPICommObject)),
                   None)

    deviceid = DeviceID()
    if objcomm is None:
        # No MPI: only honor an explicitly provided device ID
        selector = Conditional(CondNe(deviceid, -1),
                               Call('omp_set_default_device', [deviceid]))
    else:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = Function('omp_get_num_devices')()
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        osdd_then = Call('omp_set_default_device', [deviceid])
        osdd_else = Call('omp_set_default_device', [rank % ngpus])

        fallback = List(body=[rank_decl, rank_init, ngpus_init, osdd_else])
        selector = Conditional(CondNe(deviceid, -1), osdd_then, fallback)

    init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                body=[selector],
                footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}
def make(self, hs):
    """
    Construct Callables and Calls implementing distributed-memory halo
    exchange for the HaloSpot ``hs``.

    The generated Callables are appended to ``self._efuncs``; the returned
    List contains the halo update Calls, the compute Call, the halo wait
    Calls (if any) and the remainder Call (if any), in this order.
    """
    # Sanity check
    assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

    for f, hse in hs.fmapper.items():
        # Build an MPIMsg, a data structure to be propagated across the
        # various halo exchange routines
        if (f, hse) in self._msgs:
            msg = self._msgs[(f, hse)]
        else:
            msgkey = self._gen_msgkey()
            msg = self._msgs.setdefault((f, hse), self._make_msg(f, hse, msgkey))

        # Callables for send/recv/wait
        if (f.ndim, hse) not in self._cache_halo:
            self._make_all(f, hse, msg)

    msgs = [self._msgs[(f, hse)] for f, hse in hs.fmapper.items()]

    # Callable for poking the asynchronous progress engine
    key = self._gen_compkey()
    poke = self._make_poke(hs, key, msgs)
    if poke is not None:
        self._efuncs.append(poke)

    # Callable for compute over the CORE region
    callpoke = self._call_poke(poke)
    compute = self._make_compute(hs, key, msgs, callpoke)
    if compute is not None:
        self._efuncs.append(compute)

    # Callable for compute over the OWNED region
    region = self._regions.setdefault(hs, self._make_region(hs, key))
    callcompute = self._call_compute(hs, compute, msgs)
    remainder = self._make_remainder(hs, key, callcompute, region)
    if remainder is not None:
        self._efuncs.append(remainder)

    # Now build up the HaloSpot body, with explicit Calls to the constructed
    # Callables: all halo updates first, then the compute, then the waits
    updates = []
    waits = []
    for f, hse in hs.fmapper.items():
        msg = self._msgs[(f, hse)]
        haloupdate, halowait = self._cache_halo[(f.ndim, hse)]
        updates.append(self._call_haloupdate(haloupdate.name, f, hse, msg))
        if halowait is not None:
            waits.append(self._call_halowait(halowait.name, f, hse, msg))

    body = updates + [callcompute] + waits
    if remainder is not None:
        body.append(self._call_remainder(remainder))

    return List(body=body)
def _padding(self, nodes, state):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input arrays.

    Return a 2-tuple ``(tree, {})``; ``tree`` is ``nodes`` itself whenever
    the transformation is not applicable.
    """
    mapper = OrderedDict()

    # Assess feasibility of the transformation
    handle = FindSymbols('symbolics-writes').visit(nodes)
    if not handle:
        return nodes, {}
    shape = max([i.shape for i in handle], key=len)
    if not shape:
        return nodes, {}
    candidates = [i for i in handle if i.shape[-1] == shape[-1]]
    if not candidates:
        return nodes, {}

    # Retrieve the maximum number of items in a SIMD register when processing
    # the expressions in /node/
    exprs = FindNodes(Expression).visit(nodes)
    exprs = [e for e in exprs if e.write in candidates]
    assert len(exprs) > 0
    dtype = exprs[0].dtype
    assert all(e.dtype == dtype for e in exprs)
    try:
        simd_items = get_simd_items(dtype)
    except KeyError:
        # Fallback to 16 (maximum expectable padding, for AVX512 registers).
        # Floor division keeps the item count an integer; true division
        # would leak a float into the padded shapes computed below
        simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

    # Round the innermost extent of each candidate up via `roundm`
    shapes = {k: k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
              for k in candidates}
    mapper.update(OrderedDict([(k.indexed,
                                Array(name='p%s' % k.name,
                                      shape=shapes[k],
                                      dimensions=k.indices,
                                      onstack=k._mem_stack).indexed)
                               for k in candidates]))

    # Substitute original arrays with padded buffers
    processed = SubstituteExpression(mapper).visit(nodes)

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicFunction])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = List(body=init + as_tuple(processed) + copyback)

    return processed, {}
def instrument(self, iet):
    """
    Surround every Section in ``iet`` with a Call to ``self._api_resume``
    before it and a Call to ``self._api_pause`` after it.
    """
    # Transform the Iteration/Expression tree introducing Advisor calls that
    # resume and stop data collection
    mapper = {}
    for section in FindNodes(Section).visit(iet):
        resume = Call(self._api_resume)
        pause = Call(self._api_pause)
        mapper[section] = List(body=[resume, section, pause])

    return Transformer(mapper).visit(iet)
def _build_casts(self, nodes):
    """
    Prepend array and pointer casts to the Iteration/Expression tree ``nodes``.

    An ArrayCast is created for each tensor in ``self.input`` flagged as
    ``_mem_external``; a PointerCast for an Object built from the profiler's
    name, dtype and ``new`` attribute comes last.
    """
    casts = []
    for f in self.input:
        if f.is_Tensor and f._mem_external:
            casts.append(ArrayCast(f))

    prof = self.profiler
    casts.append(PointerCast(Object(prof.name, prof.dtype, prof.new)))

    return List(body=casts + [nodes])
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names to
                       :class:`Callable`s called from within ``iet``. Calls
                       whose name is not in the mapper are treated as
                       external and ignored. Entries flagged as ``local``
                       are updated in place with the transformed callable.
    """
    if func_table is None:
        # Allows callers that have no callables to resolve to pass `iet` only
        func_table = {}

    # Resolve function calls first
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            # Unknown names are skipped rather than raising a KeyError
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        else:
            scopes.append((k, v))

    def not_parallel(i):
        return not i.is_Parallel

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.write is None or k.write._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue
        elif k.write._mem_stack:
            # On the stack
            site = filter_iterations(v, key=not_parallel, stop='asap') or [iet]
            allocator.push_stack(site[-1], k.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.write)

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = NestedTransformer(mapper).visit(iet)
    # Iterate over a snapshot, since the table is updated while looping
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def _specialize_iet(self, iet, **kwargs):
    """
    Specialize ``iet`` for the OPS backend.

    The returned List contains, in order: the ``ops_init`` Call, the OPS
    block declaration, the code produced by ``create_ops_dat`` and ``opsit``,
    the ``ops_partition`` Call, ``iet`` itself and the ``ops_exit`` Call.
    As side effects, ``self._ops_kernels``, ``self._headers`` and
    ``self._includes`` are extended.

    Raises an AssertionError if the affine trees found in ``iet`` do not all
    have the same number of dimensions.
    """
    warning("The OPS backend is still work-in-progress")

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    ops_block = OpsBlock('block')

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, trees in find_affine_trees(iet).items():
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    for n, (_, trees) in enumerate(find_affine_trees(iet).items()):
        pre_loop, ops_kernel = opsit(trees, n)
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

    # `all` is required: asserting on a bare generator expression is
    # always true, which made this check a no-op
    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"'))
    )))

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet, ops_exit]

    return List(body=body)
def update_halo(f, fixed):
    """
    Construct an IET performing a halo exchange for a :class:`TensorFunction`.

    For each Dimension of ``f`` not in ``fixed``, two guarded ``sendrecv``
    Calls are generated: one sending to the left and receiving from the
    right, and one doing the opposite. The result is a static Callable named
    ``halo_exchange_<f.name>``.
    """
    # Requirements
    assert f.is_Function
    assert f.grid is not None

    distributor = f.grid.distributor
    nb = distributor._C_neighbours.obj
    comm = distributor._C_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in fixed}

    mapper = get_views(f, fixed)

    body = []
    masks = []
    for d in f.dimensions:
        if d in fixed:
            continue

        rpeer = FieldFromPointer("%sright" % d, nb)
        lpeer = FieldFromPointer("%sleft" % d, nb)

        # (side sending OWNED data, side receiving HALO data, mask suffix,
        #  [source peer, destination peer])
        exchanges = (
            # Sending to left, receiving from right
            (LEFT, RIGHT, 'l', [rpeer, lpeer]),
            # Sending to right, receiving from left
            (RIGHT, LEFT, 'r', [lpeer, rpeer]),
        )
        for sendside, recvside, suffix, peers in exchanges:
            sendsizes, sendofs = mapper[(d, sendside, OWNED)]
            recvsizes, recvofs = mapper[(d, recvside, HALO)]
            assert sendsizes == recvsizes

            args = ([f] + list(f.symbolic_shape) + sendsizes + sendofs + recvofs +
                    peers + [comm])
            call = Call('sendrecv_%s' % f.name, args)
            mask = Symbol(name='m%s%s' % (d, suffix))
            body.append(Conditional(mask, call))
            masks.append(mask)

    iet = List(body=body)
    parameters = ([f] + masks + [comm, nb] + list(fixed.values()) +
                  [d.symbolic_size for d in f.dimensions])
    return Callable('halo_exchange_%s' % f.name, iet, 'void', parameters, ('static',))
def process(self, iet):
    """
    Lower the SyncSpots found in ``iet``.

    Each SyncSpot is rewritten by running, in a fixed order, the callback
    associated with each type of SyncOp it carries. Initialization and
    finalization code collected along the way is placed at the top and at
    the bottom of the rebuilt tree. Return the new tree and a dict with the
    ``efuncs``, ``includes`` and ``args`` to be added; if no SyncSpot is
    found, return ``iet`` unchanged and an empty dict.
    """
    # The SyncOps are to be processed in the following order
    order = (WaitLock, WithLock, Delete, FetchWait, FetchWaitPrefetch)

    callbacks = {
        WaitLock: self._make_waitlock,
        WithLock: self._make_withlock,
        FetchWait: self._make_fetchwait,
        FetchWaitPrefetch: self._make_fetchwaitprefetch,
        Delete: self._make_delete
    }

    sync_spots = FindNodes(SyncSpot).visit(iet)
    if not sync_spots:
        return iet, {}

    Pieces = namedtuple('Pieces', 'init finalize funcs threads')
    pieces = Pieces([], [], [], [])

    subs = {}
    for spot in sync_spots:
        by_type = as_mapper(spot.sync_ops, lambda i: type(i))
        for optype in sorted(by_type, key=order.index):
            # Each callback works on the outcome of the previous one
            current = subs.get(spot, spot)
            subs[spot] = callbacks[optype](current, by_type[optype], pieces, iet)

    iet = Transformer(subs).visit(iet)

    # Add initialization and finalization code
    init = List(body=pieces.init, footer=c.Line())
    finalize = List(header=c.Line(), body=pieces.finalize)
    iet = iet._rebuild(body=(init,) + iet.body + (finalize,))

    sizes = [i.size for i in pieces.threads if not is_integer(i.size)]
    return iet, {'efuncs': pieces.funcs,
                 'includes': ['pthread.h'],
                 'args': sizes}
def _schedule_expressions(self, clusters): """Create an Iteartion/Expression tree given an iterable of :class:`Cluster` objects.""" # Build the Iteration/Expression tree processed = [] schedule = OrderedDict() for i in clusters: # Build the Expression objects to be inserted within an Iteration tree expressions = [ Expression(v, np.int32 if i.trace.is_index(k) else self.dtype) for k, v in i.trace.items() ] if not i.stencil.empty: root = None entries = i.stencil.entries # Can I reuse any of the previously scheduled Iterations ? index = 0 for j0, j1 in zip(entries, list(schedule)): if j0 != j1 or j0.dim in clusters.atomics[i]: break root = schedule[j1] index += 1 needed = entries[index:] # Build and insert the required Iterations iters = [ Iteration([], j.dim, j.dim.limits, offsets=j.ofs) for j in needed ] body, tree = compose_nodes(iters + [expressions], retrieve=True) scheduling = OrderedDict(zip(needed, tree)) if root is None: processed.append(body) schedule = scheduling else: nodes = list(root.nodes) + [body] mapper = {root: root._rebuild(nodes, **root.args_frozen)} transformer = Transformer(mapper) processed = list(transformer.visit(processed)) schedule = OrderedDict( list(schedule.items())[:index] + list(scheduling.items())) for k, v in list(schedule.items()): schedule[k] = transformer.rebuilt.get(v, v) else: # No Iterations are needed processed.extend(expressions) return List(body=processed)
def _make_haloupdate(self, f, hse, key, **kwargs):
    """
    Build the HaloUpdate for the function ``f``.

    For each Dimension of ``f`` not in ``hse.loc_indices``, one ``sendrecv``
    Call is generated for each side listed in ``hse.halos``.
    """
    distributor = f.grid.distributor
    nb = distributor._obj_neighborhood
    comm = distributor._obj_comm
    sendrecv = self._cache_dims[f.dimensions][0]

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # Build a mapper `(dim, side, region) -> (size, ofs)` for `f`. `size` and
    # `ofs` are symbolic objects. This mapper tells what data values should be
    # sent (OWNED) or received (HALO) given dimension and side
    mapper = {}
    for d0 in f.dimensions:
        if d0 in fixed:
            continue
        for side in (LEFT, RIGHT):
            for region in (OWNED, HALO):
                sizes = []
                ofs = []
                for d1 in f.dimensions:
                    if d1 in fixed:
                        ofs.append(fixed[d1])
                        continue
                    meta = f._C_get_field(region if d0 is d1 else NOPAD, d1, side)
                    ofs.append(meta.offset)
                    sizes.append(meta.size)
                mapper[(d0, side, region)] = (sizes, ofs)

    calls = []
    for d in f.dimensions:
        if d in fixed:
            continue

        # Field names have one character per distributor Dimension: `r`/`l`
        # in the position of `d`, `c` elsewhere
        rname = ''.join('r' if i is d else 'c' for i in distributor.dimensions)
        rpeer = FieldFromPointer(rname, nb)
        lname = ''.join('l' if i is d else 'c' for i in distributor.dimensions)
        lpeer = FieldFromPointer(lname, nb)

        if (d, LEFT) in hse.halos:
            # Sending to left, receiving from right
            lsizes, lofs = mapper[(d, LEFT, OWNED)]
            rsizes, rofs = mapper[(d, RIGHT, HALO)]
            calls.append(self._call_sendrecv(sendrecv.name, f, lsizes, lofs, rofs,
                                             rpeer, lpeer, comm, **kwargs))

        if (d, RIGHT) in hse.halos:
            # Sending to right, receiving from left
            rsizes, rofs = mapper[(d, RIGHT, OWNED)]
            lsizes, lofs = mapper[(d, LEFT, HALO)]
            calls.append(self._call_sendrecv(sendrecv.name, f, rsizes, rofs, lofs,
                                             lpeer, rpeer, comm, **kwargs))

    iet = List(body=calls)

    parameters = [f, comm, nb] + list(fixed.values())

    return HaloUpdate(key, iet, parameters)
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Turn ``iet`` into a ThreadFunction executed asynchronously.

    The thread body is made of the transfers built via
    ``self.lang._map_update_host_async``, the setting of the handles to 1,
    the body of ``iet``, and the setting of the handles to 2. Return the
    thread activation logic; ``pieces`` is updated with the newly created
    functions, initialization/finalization code and objects.
    """
    # Sorting for deterministic code gen
    locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

    # The `min` is used to pick the maximum possible degree of parallelism.
    # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
    # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
    # `u`, while `lock1` protects 2 entries of the Function `v`, then there
    # will never be more than 2 threads in flight concurrently
    npthreads = min(i.size for i in locks)

    queueid = SharedData._field_id

    updates = []
    for s in sync_ops:
        imask = []
        for d in s.target.dimensions:
            if d.root in s.lock.locked_dimensions:
                imask.append(s.handle.indices[d])
            else:
                imask.append(FULL)
        updates.append(PragmaTransfer(self.lang._map_update_host_async, s.target,
                                      imask=imask, queueid=SharedData._field_id))

    preactions = [BlankLine] + updates
    wait = self.lang._map_wait(queueid)
    if wait is not None:
        preactions.append(Pragma(wait))
    preactions += [DummyExpr(s.handle, 1) for s in sync_ops]
    preactions.append(BlankLine)

    postactions = [BlankLine] + [DummyExpr(s.handle, 2) for s in sync_ops]

    # Turn `iet` into a ThreadFunction so that it can be executed
    # asynchronously by a pthread in the `npthreads` pool
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    body = List(body=tuple(preactions) + iet.body + tuple(postactions))
    tctx = make_thread_ctx(name, body, root, npthreads, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    # Schedule computation to the first available thread
    return tctx.activate
def _make_fetchwait(self, iet, sync_ops, *args):
    """
    Place in front of ``iet`` one ``self.lang._map_to`` clause per SyncOp.

    Along the Dimension sharing the root of ``s.dim``, the clause spans
    ``s.size`` entries starting at ``s.fetch`` evaluated at the symbolic
    minimum of ``s.dim``; all other Dimensions are taken in full.
    """
    # Construct fetches
    fetches = []
    for s in sync_ops:
        start = s.fetch.subs(s.dim, s.dim.symbolic_min)
        imask = []
        for d in s.dimensions:
            if d.root is s.dim.root:
                imask.append((start, s.size))
            else:
                imask.append(FULL)
        fetches.append(self.lang._map_to(s.function, imask))

    # Glue together the new IET pieces
    return List(header=fetches, body=iet)
def _make_sendrecv(self, f, hse, key='', msg=None):
    """
    Build the static Callable ``sendrecv<key>`` for the function ``f``.

    The buffers, the sizes and the MPI requests are fields accessed through
    ``msg``. The generated body posts an ``MPI_Irecv``, gathers into the
    send buffer (unless sending to ``MPI_PROC_NULL``) and posts an
    ``MPI_Isend``.
    """
    comm = f.grid.distributor._obj_comm

    bufg = FieldFromPointer(msg._C_field_bufg, msg)
    bufs = FieldFromPointer(msg._C_field_bufs, msg)

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    # One size per distributed Dimension of `f`
    sizes = []
    for i in range(len(f._dist_dimensions)):
        sizes.append(FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg))

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')),
                         Call('gather%s' % key, [bufg] + sizes + [f] + ofsg))

    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))

    recv_args = [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                 fromrank, Integer(13), comm, rrecv]
    send_args = [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                 torank, Integer(13), comm, rsend]
    recv = Call('MPI_Irecv', recv_args)
    send = Call('MPI_Isend', send_args)

    iet = List(body=[recv, gather, send])
    iet = List(body=iet_insert_C_decls(iet))

    parameters = [f] + ofsg + [fromrank, torank, comm, msg]
    return Callable('sendrecv%s' % key, iet, 'void', parameters, ('static',))
def avoid_denormals(iet):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.

    Return the rebuilt tree together with the headers that must be included.
    """
    comment = cgen.Comment('Flush denormal numbers to zero in hardware')
    daz = cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)')
    ftz = cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')

    # The flushing code goes first in the body
    prologue = List(header=(comment, daz, ftz))
    iet = iet._rebuild(body=(prologue,) + iet.body)

    return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
def _make_waitprefetch(self, iet, sync_ops, pieces, *args):
    """
    Place in front of ``iet`` one BusyWait per distinct (sdata, threads)
    pair registered in ``pieces.objs`` for the given ``sync_ops``. Each
    BusyWait spins while the flag field of the thread's sdata differs from 1.
    """
    ff = SharedData._field_flag

    objs = filter_ordered(pieces.objs.get(s) for s in sync_ops)
    waits = [BusyWait(CondNe(FieldFromComposite(ff, sdata[threads.index]), 1))
             for sdata, threads in objs]

    return List(header=c.Comment("Wait for the arrival of prefetched data"),
                body=waits + [BlankLine, iet])
def instrument(self, iet, timer):
    """
    Wrap the first time loop found in ``iet`` between a statement calling
    ``self._api_resume`` and one calling ``self._api_pause``.

    ``timer`` is not used. If no time loop is found, ``iet`` is returned
    intact.
    """
    # Look for the presence of a time loop within the IET of the Operator
    for i in FindNodes(Iteration).visit(iet):
        if not i.dim.is_Time:
            continue
        # The calls to Advisor's Collection Control API are only for Operators
        # with a time loop
        resume = c.Statement('%s()' % self._api_resume)
        pause = c.Statement('%s()' % self._api_pause)
        mapper = {i: List(header=resume, body=i, footer=pause)}
        return Transformer(mapper).visit(iet)

    # Return the IET intact if no time loop is found
    return iet
def _build_casts(self, iet):
    """
    Extend the casts built by the parent class with pointer casts for the
    YASK solution and for the YASK grids.
    """
    iet = super(Operator, self)._build_casts(iet)

    # Add YASK solution pointer for use in C-land
    soln_obj = Object(namespace['code-soln-name'], namespace['type-solution'])

    # Add YASK user and local grids pointers for use in C-land
    grid_objs = []
    for i in self.input:
        if i.from_YASK:
            grid_objs.append(YaskGridObject(i.name))
    for i in self.yk_soln.local_grids:
        grid_objs.append(YaskGridObject(i))

    # Build pointer casts: the solution first, then the grids
    casts = [PointerCast(obj) for obj in [soln_obj] + grid_objs]

    return List(body=casts + [iet])
def _generate_mpi(self, iet, **kwargs):
    """
    Replace the HaloSpots in ``iet`` with Calls performing the halo update.

    The C-level routines implementing the exchange are registered in
    ``self._func_table``; ``self._globals`` and ``self._includes`` are
    extended as well. ``iet`` is returned untouched if MPI is disabled in
    the configuration.
    """
    if configuration['mpi'] is False:
        return iet

    halo_spots = FindNodes(HaloSpot).visit(iet)

    # For each MPI-distributed TensorFunction, generate all necessary
    # C-level routines to perform a halo update
    callables = OrderedDict()
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            callables[f] = [
                update_halo(f, v.loc_indices),
                sendrecv(f, v.loc_indices),
                copy(f, v.loc_indices),
                copy(f, v.loc_indices, True),
            ]
    callables = flatten(callables.values())

    # Replace HaloSpots with suitable calls performing the halo update
    calls = {}
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            distributor = f.grid.distributor
            stencil = [int(i) for i in hs.mask[f].values()]
            comm = distributor._C_comm
            nb = distributor._C_neighbours.obj
            loc_indices = list(v.loc_indices.values())
            dsizes = [d.symbolic_size for d in f.dimensions]
            parameters = [f] + stencil + [comm, nb] + loc_indices + dsizes
            calls.setdefault(hs, []).append(
                Call('halo_exchange_%s' % f.name, parameters)
            )

    # Sorting is for deterministic code generation. However, in practice,
    # we don't expect `cstructs` to contain more than one element because
    # there should always be one grid per Operator (though we're not really
    # enforcing it)
    cstructs = {f.grid.distributor._C_neighbours.cdef
                for f in flatten(i.fmapper for i in halo_spots)}
    self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

    self._includes.append('mpi.h')

    self._func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                         for i in callables]))

    # Add in the halo update calls, ahead of the original HaloSpot body
    mapper = {}
    for hs, hscalls in calls.items():
        mapper[hs] = List(body=hscalls + list(hs.body))
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
def test_create_elemental_functions_simple(simple_function):
    """
    Check the C code generated by ``transform(..., mode='split')`` once the
    last Iteration of each tree in ``simple_function`` has been tagged as
    ELEMENTAL.
    """
    # Pick the last Iteration of each tree and tag it as ELEMENTAL
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    mapper = {i: j._rebuild(properties=(j.properties + (ELEMENTAL,)))
              for i, j in zip(roots, retagged)}
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    output = [i for i in output.split('\n')
              if all([j not in i for j in ('#pragma', '/*')])]
    # NOTE(review): the line layout of the expected string below must match
    # the generated code exactly -- confirm against the code generator
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i, const int i_size, const int j, const int j_size, const int k_size)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
def _build_casts(self, iet):
    """
    Extend the casts built by the parent class with pointer casts for each
    YASK solution in ``self.yk_solns`` and for the YASK grids.
    """
    iet = super(Operator, self)._build_casts(iet)

    # Add YASK solution pointer for use in C-land
    objs = [YaskSolnObject(cname) for _, cname in self.yk_solns]

    # Add YASK user and local grids pointers for use in C-land
    for i in self.input:
        if i.from_YASK:
            objs.append(YaskGridObject(i.name))
    for i in self._local_grids:
        objs.append(YaskGridObject(i))

    # Build pointer casts: the solutions first, then the grids
    casts = [PointerCast(obj) for obj in objs]

    return List(body=casts + [iet])
def _make_prefetchupdate(self, iet, sync_ops, pieces, root):
    """
    Turn ``iet`` into a ThreadFunction performing the prefetch.

    The thread body is made of the body of ``iet`` followed by one
    ``self.lang._map_update_wait_device`` clause per SyncOp. Return the
    thread activation logic; ``pieces`` is updated with the newly created
    functions, initialization/finalization code and objects.
    """
    fid = SharedData._field_id

    postactions = []
    for s in sync_ops:
        # `pcond` is not None, but we won't use it here because the condition
        # is actually already encoded in `iet` itself (it stems from the
        # originating Cluster's guards)
        assert s.pcond is not None

        imask = []
        for d in s.dimensions:
            if d.root is s.dim.root:
                imask.append((s.tstore, s.size))
            else:
                imask.append(FULL)
        clause = self.lang._map_update_wait_device(s.target, imask, fid)
        postactions.append(List(header=clause))

    # Turn prefetch IET into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(body=iet.body + tuple(postactions))
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    # The IET degenerates to the threads activation logic
    return tctx.activate
def _make_delete(self, iet, sync_ops, *args):
    """
    Append to ``iet`` one ``self.lang._map_delete`` clause per SyncOp.

    Along the Dimension sharing the root of ``s.dim``, the clause spans
    ``s.size`` entries starting at ``s.fetch``; all other Dimensions are
    taken in full.
    """
    # Construct deletion clauses
    deletions = []
    for s in sync_ops:
        imask = []
        for d in s.dimensions:
            if d.root is s.dim.root:
                imask.append((s.fetch, s.size))
            else:
                imask.append(FULL)
        deletions.append(self.lang._map_delete(s.function, imask))

    # Glue together the new IET pieces
    footer = [c.Line()] + deletions
    return List(header=c.Line(), body=iet, footer=footer)
def _initialize(iet):
    """
    Prepend to ``iet`` the code initializing ``devicenum``.

    If an MPICommObject is among the parameters of ``iet``, ``devicenum`` is
    set to ``rank % ngpus``; otherwise it is set to 0.
    """
    comm = next((p for p in iet.parameters if isinstance(p, MPICommObject)),
                None)

    if comm is None:
        begin, end = 'Begin of OpenMP setup', 'End of OpenMP setup'
        body = [LocalExpression(DummyEq(devicenum, 0))]
    else:
        begin, end = 'Begin of OpenMP+MPI setup', 'End of OpenMP+MPI setup'

        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = Function('omp_get_num_devices')()
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

        body = [rank_decl, rank_init, ngpus_init, devicenum_init]

    init = List(header=c.Comment(begin), body=body,
                footer=(c.Comment(end), c.Line()))

    return iet._rebuild(body=(init,) + iet.body)