def _initialize(iet):
    """
    Prepend the OpenMP+MPI device setup to ``iet``.

    If one of the parameters of ``iet`` is an ``MPICommObject``, ``iet`` is
    rebuilt with a leading block that declares ``rank``, fills it through
    ``MPI_Comm_rank``, declares ``ngpus`` as ``omp_get_num_devices()`` and
    finally calls ``omp_set_default_device(rank % ngpus)``. If no such
    parameter exists, ``iet`` is returned untouched.
    """
    # Only the first MPI communicator among the parameters is considered
    comm = next((p for p in iet.parameters if isinstance(p, MPICommObject)), None)
    if comm is None:
        return iet

    # int rank = 0; MPI_Comm_rank(comm, &rank);
    rank = Symbol(name='rank')
    declare_rank = LocalExpression(DummyEq(rank, 0))
    fetch_rank = Call('MPI_Comm_rank', [comm, Byref(rank)])

    # int ngpus = omp_get_num_devices();
    ngpus = Symbol(name='ngpus')
    declare_ngpus = LocalExpression(
        DummyEq(ngpus, Function('omp_get_num_devices')())
    )

    # Ranks are mapped onto the available devices in a round-robin fashion
    pick_device = Call('omp_set_default_device', [rank % ngpus])

    setup = List(
        header=c.Comment('Begin of OpenMP+MPI setup'),
        body=[declare_rank, fetch_rank, declare_ngpus, pick_device],
        footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()),
    )

    return iet._rebuild(body=(setup,) + iet.body)
def _(iet):
    """
    Prepend the OpenACC(+MPI) device setup to ``iet``.

    The emitted block calls ``acc_init`` and then, guarded by
    ``CondNe(deviceid, -1)``, calls ``acc_set_device_num`` with the
    ``DeviceID`` object. When ``iet`` has an ``MPICommObject`` parameter, the
    conditional also gets an else-branch that retrieves the MPI rank and sets
    the device to ``rank % ngpus``.

    Returns the rebuilt ``iet`` together with ``{'args': deviceid}``.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)

    # Only the first MPI communicator among the parameters is considered
    objcomm = next(
        (p for p in iet.parameters if isinstance(p, MPICommObject)), None
    )

    deviceid = DeviceID()
    device_nvidia = Macro('acc_device_nvidia')

    # Pieces shared by the MPI and the non-MPI variants
    acc_init = Call('acc_init', [device_nvidia])
    has_deviceid = CondNe(deviceid, -1)
    use_deviceid = Call('acc_set_device_num', [deviceid, device_nvidia])

    if objcomm is None:
        # No communicator: there is nothing to fall back to
        chooser = Conditional(has_deviceid, use_deviceid)
    else:
        rank = Symbol(name='rank')
        declare_rank = LocalExpression(DummyEq(rank, 0))
        fetch_rank = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        count_devices = DefFunction('acc_get_num_devices', device_nvidia)
        declare_ngpus = LocalExpression(DummyEq(ngpus, count_devices))

        # Fallback: derive the device number from the MPI rank
        use_rank = Call('acc_set_device_num', [rank % ngpus, device_nvidia])
        fallback = List(body=[declare_rank, fetch_rank, declare_ngpus, use_rank])

        chooser = Conditional(has_deviceid, use_deviceid, fallback)

    setup = List(
        header=c.Comment('Begin of OpenACC+MPI setup'),
        body=[acc_init, chooser],
        footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()),
    )

    return iet._rebuild(body=(setup,) + iet.body), {'args': deviceid}
def push_scalar_on_stack(self, scope, expr):
    """
    Define a Scalar on the stack.

    The object written by ``expr`` is recorded under ``scope``; if it was
    already recorded there, nothing happens. Otherwise ``expr`` itself is
    mapped to a ``LocalExpression`` built from ``expr.args``.
    """
    declared = self.stack.setdefault(scope, OrderedDict())

    target = expr.write
    if target not in declared:
        # The entry only acts as a placeholder that prevents reallocation
        declared[target] = None
        self.stack[expr] = LocalExpression(**expr.args)
def iet_insert_C_decls(iet, func_table):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    Note that the entries of ``func_table`` flagged as local are replaced
    in-place with transformed counterparts.

    :param iet: The input Iteration/Expression tree.
    :param func_table: A mapper from callable names to :class:`Callable`s
                       called from within ``iet``.
    """

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    # Resolve function calls first: a Call is replaced by the expressions
    # found in the callee, but only if the callee is local
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(iet).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = func_table[node.name]
        if func.local:
            scopes.extend(finder.visit(func.root, queue=list(enclosing)).items())

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for expr, enclosing in scopes:
        if expr.is_scalar:
            # Inline declaration
            mapper[expr] = LocalExpression(**expr.args)
            continue

        if expr.write is None or expr.write._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue

        if expr.write._mem_stack:
            # On the stack
            sites = filter_iterations(enclosing, key=is_not_parallel, stop='asap')
            sites = sites or [iet]
            allocator.push_stack(sites[-1], expr.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(expr.write)

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    iet = NestedTransformer(mapper).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            rebuilt = Transformer(mapper).visit(meta.root)
            func_table[name] = MetaCall(rebuilt, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Turn ``iet`` into a thread function protected by the locks in ``sync_ops``.

    The body of ``iet`` is wrapped between pre-actions (one host update plus
    ``handle = 1`` per sync operation) and post-actions (``handle = 2`` per
    sync operation) and handed over to ``make_thread_ctx``. The lock
    initialization, the thread start-up and the final wait are appended to
    ``pieces``, which is modified in-place.

    Returns the activation node of the created thread context, which takes
    the place of ``iet``.
    """
    # Sorting for deterministic code gen
    locks = sorted({op.lock for op in sync_ops}, key=lambda lock: lock.name)

    # The `min` is used to pick the maximum possible degree of parallelism.
    # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
    # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
    # `u`, while `lock1` protects 2 entries of the Function `v`, then there
    # will never be more than 2 threads in flight concurrently
    npthreads = min(lock.size for lock in locks)

    preactions = []
    postactions = [BlankLine]
    for op in sync_ops:
        # Locked dimensions are accessed through the handle's index, all
        # other dimensions are taken in full
        imask = []
        for d in op.target.dimensions:
            if d.root in op.lock.locked_dimensions:
                imask.append(op.handle.indices[d])
            else:
                imask.append(FULL)

        pragmas = self.lang._map_update_wait_host(
            op.target, imask, SharedData._field_id
        )
        update = List(header=pragmas)

        preactions.append(List(body=[BlankLine, update, DummyExpr(op.handle, 1)]))
        postactions.append(DummyExpr(op.handle, 2))
    preactions.append(BlankLine)

    # Turn `iet` into a ThreadFunction so that it can be executed
    # asynchronously by a pthread in the `npthreads` pool
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    body = List(body=tuple(preactions) + iet.body + tuple(postactions))
    tctx = make_thread_ctx(name, body, root, npthreads, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Initialize the locks: every entry starts with the value 2
    for lock in locks:
        initial = np.full(lock.shape, 2, dtype=np.int32).tolist()
        init_lock = LocalExpression(DummyEq(lock, ListInitializer(initial)))
        pieces.init.append(init_lock)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    # Schedule computation to the first available thread
    return tctx.activate
def make_parallel(self, iet):
    """
    Transform ``iet`` by decorating its parallel :class:`Iteration`s with
    suitable ``#pragma omp ...`` for thread-level parallelism.

    Returns a 2-tuple. The first entry is the transformed tree wrapped in a
    :class:`List`. The second entry is a dict which carries the ``NThreads``
    object under the key ``'input'`` if at least one Iteration was
    transformed, and is empty otherwise.
    """
    # Group sequences of loops that should go within the same parallel region
    groups = OrderedDict()
    prev_tagged = False
    for tree in retrieve_iteration_tree(iet):
        # Determine the number of consecutive parallelizable Iterations
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if candidates:
            # Consecutive tagged Iteration go in the same group, that is the
            # latest group is reused rather than opening a new one
            tagged = any(i.tag is not None for i in tree)
            index = len(groups) - (tagged & prev_tagged)
            groups.setdefault(index, OrderedDict())[candidates[0]] = candidates
            prev_tagged = tagged
        else:
            prev_tagged = False

    mapper = OrderedDict()
    for group in groups.values():
        stack_arrays = []
        for root, candidates in group.items():
            mapper.update(self._make_parallel_tree(root, candidates))

            # Track the Arrays that live on the stack, as they end up in
            # the `private` clause
            for symbol in FindSymbols('symbolics').visit(root):
                if symbol.is_Array and symbol._mem_stack:
                    stack_arrays.append(symbol)

        # Build the parallel region
        names = sorted({array.name for array in stack_arrays})
        clause = ('private(%s)' % ','.join(names)) if names else ''
        rebuilt = [v for k, v in mapper.items() if k in group]
        par_region = Block(header=self.lang['par-region'](clause), body=rebuilt)

        # Any Iteration still in the mapper is either dropped (remainder
        # loops) or replaced by the parallel region
        for k, v in list(mapper.items()):
            if not isinstance(v, Iteration):
                continue
            mapper[k] = None if v.is_Remainder else par_region

    processed = Transformer(mapper).visit(iet)

    if not mapper:
        return List(body=processed), {}

    # Hack/workaround to the fact that the OpenMP pragmas are not true
    # IET nodes, so the `nthreads` variables won't be detected as a
    # Callable parameter unless inserted in a mock Expression
    nt = NThreads()
    eq = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32), nt))
    return List(body=[eq, processed]), {'input': [nt]}
def _initialize(iet):
    """
    Prepend the OpenACC+MPI device setup to ``iet``.

    The emitted block always calls ``acc_init``. When ``iet`` has an
    ``MPICommObject`` parameter, ``acc_init`` is preceded by the code that
    retrieves the MPI rank, computes ``devicenum = rank % ngpus`` and calls
    ``acc_set_device_num``.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)

    # Only the first MPI communicator among the parameters is considered
    comm = next((p for p in iet.parameters if isinstance(p, MPICommObject)), None)

    device_nvidia = Macro('acc_device_nvidia')
    acc_init = Call('acc_init', [device_nvidia])

    if comm is None:
        # No communicator: the body is the bare `acc_init` call
        body = acc_init
    else:
        rank = Symbol(name='rank')
        declare_rank = LocalExpression(DummyEq(rank, 0))
        fetch_rank = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        count_devices = DefFunction('acc_get_num_devices', device_nvidia)
        declare_ngpus = LocalExpression(DummyEq(ngpus, count_devices))

        devicenum = Symbol(name='devicenum')
        declare_devicenum = LocalExpression(DummyEq(devicenum, rank % ngpus))

        pick_device = Call('acc_set_device_num', [devicenum, device_nvidia])

        # `acc_init` comes last, after the device has been selected
        body = [
            declare_rank,
            fetch_rank,
            declare_ngpus,
            declare_devicenum,
            pick_device,
            acc_init,
        ]

    setup = List(
        header=c.Comment('Begin of OpenACC+MPI setup'),
        body=body,
        footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()),
    )

    return iet._rebuild(body=(setup,) + iet.body)
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Turn ``iet`` into a function executed asynchronously under the locks
    in ``sync_ops``.

    The body of ``iet`` is wrapped between pre-actions (one host update plus
    ``handle = 1`` per sync operation) and post-actions (``handle = 2`` per
    sync operation). The resulting function, the lock initialization, the
    thread start-up and the final wait are appended to ``pieces``, which is
    modified in-place.

    Returns the node that activates a thread, which takes the place of ``iet``.
    """
    # Sorted by name, so that the visiting order does not depend on the
    # set ordering
    locks = sorted({op.lock for op in sync_ops}, key=lambda lock: lock.name)

    # The number of threads is bound by the smallest lock
    threads = self.__make_threads(value=min(lock.size for lock in locks))

    preactions = []
    postactions = [BlankLine]
    for op in sync_ops:
        # Locked dimensions are accessed through the handle's index, all
        # other dimensions are taken in full
        imask = []
        for d in op.target.dimensions:
            if d.root in op.lock.locked_dimensions:
                imask.append(op.handle.indices[d])
            else:
                imask.append(FULL)

        pragmas = self._P._map_update_wait_host(
            op.target, imask, SharedData._field_id
        )
        preactions.append(
            List(body=[BlankLine, List(header=pragmas), DummyExpr(op.handle, 1)])
        )
        postactions.append(DummyExpr(op.handle, 2))
    preactions.append(BlankLine)

    # Turn `iet` into an ElementalFunction so that it can be
    # executed asynchronously by `threadhost`
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    body = List(body=tuple(preactions) + iet.body + tuple(postactions))
    tfunc, sdata = self.__make_tfunc(name, body, root, threads)
    pieces.funcs.append(tfunc)

    # Schedule computation to the first available thread
    activation = self.__make_activate_thread(threads, sdata, sync_ops)

    # Initialize the locks: every entry starts with the value 2
    for lock in locks:
        initial = np.full(lock.shape, 2, dtype=np.int32).tolist()
        init_lock = LocalExpression(DummyEq(lock, ListInitializer(initial)))
        pieces.init.append(init_lock)

    # Fire up the threads
    pieces.init.append(self.__make_init_threads(threads, sdata, tfunc, pieces))
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return activation
def _insert_declarations(self, nodes):
    """
    Populate the Operator's body with the necessary variable declarations.

    Returns the transformed ``nodes``. The entries of ``self.func_table``
    flagged as local are replaced in-place with transformed counterparts.
    """

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    # Resolve function calls first: a Call is replaced by the expressions
    # found in the callee, but only if the callee is local
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(nodes).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = self.func_table[node.name]
        if func.local:
            scopes.extend(finder.visit(func.root, queue=list(enclosing)).items())

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for expr, enclosing in scopes:
        if expr.is_scalar:
            # Inline declaration
            mapper[expr] = LocalExpression(**expr.args)
            continue

        if expr.write._mem_external:
            # Nothing to do, variable passed as kernel argument
            continue

        if expr.write._mem_stack:
            # On the stack, as established by the DLE
            sites = filter_iterations(enclosing, key=is_not_parallel, stop='asap')
            sites = sites or [nodes]
            allocator.push_stack(sites[-1], expr.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(expr.write)

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    nodes = NestedTransformer(mapper).visit(nodes)
    for name, meta in list(self.func_table.items()):
        if meta.local:
            rebuilt = Transformer(mapper).visit(meta.root)
            self.func_table[name] = FunMeta(rebuilt, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        nodes = List(header=decls + allocs, body=nodes, footer=frees)

    return nodes
def _make_poke(self, hs, key, msgs):
    """
    Build the ``pokempi<key>`` function.

    The function declares a ``flag`` initialized to 0 and, for each message
    in ``msgs``, loops over ``i`` calling ``MPI_Test`` first on the send
    request and then on the receive request of the ``i``-th entry.

    ``hs`` is not used here.
    """
    flag = Symbol(name='flag')

    def test_request(field, entry):
        # MPI_Test(&(entry.field), &flag, MPI_STATUS_IGNORE)
        request = Byref(FieldFromComposite(field, entry))
        return Call('MPI_Test', [request, Byref(flag), Macro('MPI_STATUS_IGNORE')])

    body = [LocalExpression(DummyEq(flag, 0))]
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        testrecv = test_request(msg._C_field_rrecv, msgi)
        testsend = test_request(msg._C_field_rsend, msgi)

        # The send request is tested before the receive request
        body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))

    return make_efunc('pokempi%d' % key, body)
def iet_insert_C_decls(iet, external=None):
    """
    Given an IET, build a new tree with the necessary symbol declarations.
    Declarations are placed as close as possible to the first symbol occurrence.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    external : tuple, optional
        The symbols defined in some outer Callable, which therefore must not
        be re-defined.

    Returns
    -------
    Node
        The transformed tree. If any object was scheduled on the heap, the
        tree is wrapped in a List carrying the declarations and allocations
        in the header and the deallocations in the footer.

    Raises
    ------
    NotImplementedError
        If a mapped node is neither an Expression nor a Call.
    """
    external = external or []

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    # Classify and then schedule declarations to stack/heap
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in MapExpressions().visit(iet).items():
        if k.is_Expression:
            if k.is_scalar_assign:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.params
        else:
            # Without this branch `objs` would either be unbound or, worse,
            # silently carry over the objects of the previous node
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(k))

        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v[-1] if v else iet
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i in external:
                        # The Array is to be defined in some foreign IET
                        continue
                    elif i._mem_stack:
                        # On the stack
                        site = filter_iterations(v, key=is_not_parallel, stop='asap')
                        site = site or [iet]
                        allocator.push_stack(site[-1], i)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    Note that the entries of ``func_table`` flagged as local are replaced
    in-place with transformed counterparts.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    def schedule(obj, enclosing):
        # Push `obj` onto the stack or the heap, depending on its kind
        try:
            if obj.is_LocalObject:
                # On the stack
                allocator.push_stack(enclosing[-1] if enclosing else iet, obj)
            elif obj.is_Array:
                if obj._mem_external:
                    # Nothing to do; e.g., a user-provided Function
                    return
                if obj._mem_stack:
                    # On the stack
                    sites = filter_iterations(
                        enclosing, key=is_not_parallel, stop='asap'
                    )
                    sites = sites or [iet]
                    allocator.push_stack(sites[-1], obj)
                else:
                    # On the heap, as a tensor that must be globally accessible
                    allocator.push_heap(obj)
        except AttributeError:
            # E.g., a generic SymPy expression
            pass

    # Detect all IET nodes accessing symbols that need to be declared. A Call
    # to a local callable contributes the callee's nodes before itself
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(iet).items():
        if node.is_Call:
            func = func_table.get(node.name)
            if func is not None and func.local:
                scopes.extend(
                    finder.visit(func.root, queue=list(enclosing)).items()
                )
        scopes.append((node, enclosing))

    # Classify, and then schedule declarations to stack/heap
    for node, enclosing in scopes:
        if node.is_Expression:
            if node.is_scalar:
                # Inline declaration
                mapper[node] = LocalExpression(**node.args)
                continue
            objs = [node.write]
        elif node.is_Call:
            objs = node.params
        else:
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(node))
        for obj in objs:
            schedule(obj, enclosing)

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    iet = Transformer(mapper, nested=True).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            rebuilt = Transformer(mapper).visit(meta.root)
            func_table[name] = MetaCall(rebuilt, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def test_make_cpp_parfor(): """ Test construction of a CPP parallel for. This excites the IET construction machinery in several ways, in particular by using Lambda nodes (to generate C++ lambda functions) and nested Calls. """ class STDVectorThreads(LocalObject): dtype = type('std::vector<std::thread>', (c_void_p, ), {}) def __init__(self): self.name = 'threads' class STDThread(LocalObject): dtype = type('std::thread&', (c_void_p, ), {}) def __init__(self, name): self.name = name class FunctionType(LocalObject): dtype = type('FuncType&&', (c_void_p, ), {}) def __init__(self, name): self.name = name # Basic symbols nthreads = Symbol(name='nthreads', is_const=True) threshold = Symbol(name='threshold', is_const=True) last = Symbol(name='last', is_const=True) first = Symbol(name='first', is_const=True) portion = Symbol(name='portion', is_const=True) # Composite symbols threads = STDVectorThreads() # Iteration helper symbols begin = Symbol(name='begin') l = Symbol(name='l') end = Symbol(name='end') # Functions stdmax = sympy.Function('std::max') # Construct the parallel-for body func = FunctionType('func') i = Dimension(name='i') threadobj = Call( 'std::thread', Lambda( Iteration(Call(func.name, i), i, (begin, end - 1, 1)), ['=', Byref(func.name)], )) threadpush = Call(FieldFromComposite('push_back', threads), threadobj) it = Dimension(name='it') iteration = Iteration([ LocalExpression(DummyEq(begin, it)), LocalExpression(DummyEq(l, it + portion)), LocalExpression(DummyEq(end, InlineIf(l > last, last, l))), threadpush ], it, (first, last, portion)) thread = STDThread('x') waitcall = Call('std::for_each', [ Call(FieldFromComposite('begin', threads)), Call(FieldFromComposite('end', threads)), Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread]) ]) body = [ LocalExpression(DummyEq(threshold, 1)), LocalExpression( DummyEq(portion, stdmax(threshold, (last - first) / nthreads))), Call(FieldFromComposite('reserve', threads), nthreads), iteration, waitcall ] 
parfor = ElementalFunction('parallel_for', body, 'void', [first, last, func, nthreads]) assert str(parfor) == """\
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    Note that the entries of ``func_table`` flagged as local are replaced
    in-place with transformed counterparts.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    # First, schedule declarations for Expressions. A Call is replaced by
    # the expressions found in the callee, but only if the callee is local
    finder = MapExpressions()
    scopes = []
    for node, enclosing in finder.visit(iet).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = func_table.get(node.name)
        if func is not None and func.local:
            scopes.extend(finder.visit(func.root, queue=list(enclosing)).items())

    for expr, enclosing in scopes:
        if expr.is_scalar:
            # Inline declaration
            mapper[expr] = LocalExpression(**expr.args)
            continue

        if expr.write is None or expr.write._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue

        if expr.write._mem_stack:
            # On the stack
            sites = filter_iterations(enclosing, key=is_not_parallel, stop='asap')
            sites = sites or [iet]
            allocator.push_stack(sites[-1], expr.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(expr.write)

    # Then, schedule declarations callables arguments passed by reference/pointer
    # (as modified internally by the callable)
    for call, enclosing in finder.visit(iet).items():
        if not call.is_Call:
            continue
        site = enclosing[-1] if enclosing else iet
        for param in call.params:
            try:
                if param.is_LocalObject:
                    # On the stack
                    allocator.push_stack(site, param)
                elif param.is_Array:
                    if param._mem_stack:
                        # On the stack
                        allocator.push_stack(site, param)
                    elif param._mem_heap:
                        # On the heap
                        allocator.push_heap(param)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    iet = NestedTransformer(mapper).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            rebuilt = Transformer(mapper).visit(meta.root)
            func_table[name] = MetaCall(rebuilt, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet