class Intel64Rewriter(CPU64Rewriter):

    lang_intel_common = {
        'ignore-deps': cgen.Pragma('ivdep'),
        'ntstores': cgen.Pragma('vector nontemporal'),
        'storefence': cgen.Statement('_mm_sfence()'),
        'noinline': cgen.Pragma('noinline')
    }
    lang = {
        'IntelCompiler': lang_intel_common,
        'IntelKNLCompiler': lang_intel_common
    }
    """
    Collection of backend-compiler-specific pragmas.
    """

    @dle_pass
    def _avoid_denormals(self, iet):
        header = [
            cgen.Comment('Flush denormal numbers to zero in hardware'),
            cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
            cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
        ]
        iet = List(header=header, body=iet)
        return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
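# Usage sketch (not from the original source): how the denormal-flush header
# above renders once cgen generates the C code. Every cgen node prints its
# generated code via str().
import cgen

header = [
    cgen.Comment('Flush denormal numbers to zero in hardware'),
    cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
    cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
]
for node in header:
    print(node)
# /* Flush denormal numbers to zero in hardware */
# _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
# _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);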
def execute_time_loop(self):
    statements = []
    statements.append(self.grid.time_stepping)
    if self.pluto:
        statements.append(cgen.Block([cgen.Pragma("scop"),
                                      self.grid.stress_loop,
                                      cgen.Pragma("endscop")]))
    else:
        statements.append(self.grid.stress_loop)

    statements.append(self.grid.stress_bc)

    if self.pluto:
        statements.append(cgen.Block([cgen.Pragma("scop"),
                                      self.grid.velocity_loop,
                                      cgen.Pragma("endscop")]))
    else:
        statements.append(self.grid.velocity_loop)

    statements.append(self.grid.velocity_bc)

    output_step = self.grid.output_step
    if output_step:
        statements.append(output_step)

    result = cgen.For(cgen.InlineInitializer(cgen.Value("int", "_ti"), 0),
                      "_ti < ntsteps", "_ti++",
                      cgen.Block(statements))
    return result
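# Usage sketch (stand-in statement names, not from the original class): the
# shape of the loop built above, with a PLUTO scop region inside the time loop.
import cgen

body = cgen.Block([cgen.Pragma("scop"),
                   cgen.Statement("update_stress()"),  # stand-in for stress_loop
                   cgen.Pragma("endscop")])
loop = cgen.For(cgen.InlineInitializer(cgen.Value("int", "_ti"), 0),
                "_ti < ntsteps", "_ti++", body)
print(loop)
# for (int _ti = 0; _ti < ntsteps; _ti++)
# {
#   #pragma scop
#   update_stress();
#   #pragma endscop
# }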
class OmpBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenMP',
        'header': 'omp.h',
        # Platform mapping
        AMDGPUX: None,
        NVIDIAX: None,
        # Runtime library
        'init': None,
        'thread-num': DefFunction('omp_get_thread_num'),
        'num-devices': lambda args:
            DefFunction('omp_get_num_devices', args),
        'set-device': lambda args:
            Call('omp_set_default_device', args),
        # Pragmas
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j:
            c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j:
            c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j, k:
            c.Pragma('omp target exit data map(release: %s%s)%s' % (i, j, k)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('omp target exit data map(delete: %s%s)%s' % (i, j, k)),
        'memcpy-to-device': lambda i, j, k:
            Call('omp_target_memcpy', [i, j, k, 0, 0,
                                       DefFunction('omp_get_device_num'),
                                       DefFunction('omp_get_initial_device')]),
        'memcpy-to-device-wait': lambda i, j, k, l:
            Call('omp_target_memcpy', [i, j, k, 0, 0,
                                       DefFunction('omp_get_device_num'),
                                       DefFunction('omp_get_initial_device')]),
        'device-get': 'omp_get_default_device()',
        'device-alloc': lambda i, j:
            'omp_target_alloc(%s, %s)' % (i, j),
        'device-free': lambda i, j:
            'omp_target_free(%s, %s)' % (i, j)
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration
    DeviceIteration = DeviceOmpIteration
    Prodder = ThreadedProdder
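# Sketch (assumed arguments): expanding two of the mapper entries above. The
# first argument is the symbol name, the second the bracketed array sections,
# and the third (where present) an extra clause such as an `if` condition.
import cgen as c

map_enter_to = lambda i, j: c.Pragma('omp target enter data map(to: %s%s)' % (i, j))
map_exit_delete = lambda i, j, k: c.Pragma('omp target exit data map(delete: %s%s)%s'
                                           % (i, j, k))
print(map_enter_to('u', '[0:x_size][0:y_size]'))
# #pragma omp target enter data map(to: u[0:x_size][0:y_size])
print(map_exit_delete('u', '[0:x_size][0:y_size]', ' if((x_size != 0))'))
# #pragma omp target exit data map(delete: u[0:x_size][0:y_size]) if((x_size != 0))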
def emit_extern_global_variables(self, source_file):
    source_file.add(cgen.Pragma("""push_macro("_SFR_IO8")"""))
    source_file.add(cgen.Line("#undef _SFR_IO8"))
    source_file.add(cgen.Define("_SFR_IO8(x)", "x"))
    for name, definition in self.definitions:
        port = definition.port[0]
        assert port in ("A", "B", "C", "D", "E")
        pin = int(definition.port[1])
        source_file.add(
            cgen.Statement(f"static kGPIO<PORT{port},{pin}> {name}"))
    source_file.add(cgen.Pragma("""pop_macro("_SFR_IO8")"""))
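# Sketch of the generated bracket (illustrative): push_macro/pop_macro preserve
# the original _SFR_IO8 definition while it is redefined to the identity, so
# the port/pin pair can be spelled as template arguments.
import cgen

print(cgen.Pragma('push_macro("_SFR_IO8")'))        # #pragma push_macro("_SFR_IO8")
print(cgen.Define("_SFR_IO8(x)", "x"))              # #define _SFR_IO8(x) x
print(cgen.Statement("static kGPIO<PORTB,5> led"))  # static kGPIO<PORTB,5> led;
print(cgen.Pragma('pop_macro("_SFR_IO8")'))         # #pragma pop_macro("_SFR_IO8")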
def _generate_lib_outer_loop(self):

    block = cgen.Block([self._components['LIB_KERNEL_CALL']])

    i = self._components['LIB_PAIR_INDEX_0']

    shared = ''
    for sx in self._components['OMP_SHARED_SYMS']:
        shared += sx + ','
    shared = shared[:-1]
    pragma = cgen.Pragma('omp parallel default(none) shared(' + shared + ')')

    parallel_region = cgen.Block((
        cgen.Value('int', '_thread_start'),
        cgen.Value('int', '_thread_end'),
        cgen.Line('get_thread_decomp((int)_N_LOCAL, &_thread_start, &_thread_end);'),
        cgen.For('int ' + i + '= _thread_start',
                 i + '< _thread_end',
                 i + '++',
                 block)
    ))

    loop = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        pragma,
        parallel_region
    ])

    self._components['LIB_OUTER_LOOP'] = loop
def _make_header(cls, **kwargs):
    construct = cls._make_construct(**kwargs)
    clauses = cls._make_clauses(**kwargs)
    header = ' '.join([construct] + clauses)
    return c.Pragma(header)
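# Sketch (assumed construct/clauses): _make_header-style assembly of a pragma
# from a construct string plus a list of clause strings.
import cgen as c

construct = 'omp parallel for'
clauses = ['schedule(static)', 'num_threads(nthreads)']
print(c.Pragma(' '.join([construct] + clauses)))
# #pragma omp parallel for schedule(static) num_threads(nthreads)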
def _generate_lib_outer_loop(self):

    block = cgen.Block([self._components['LIB_KERNEL_GATHER'],
                        self._components['LIB_INNER_LOOP'],
                        self._components['LIB_KERNEL_SCATTER']])

    i = self._components['LIB_PAIR_INDEX_0']

    shared = ''
    for sx in self._components['OMP_SHARED_SYMS']:
        shared += sx + ','
    shared = shared[:-1]

    pragma = cgen.Pragma('omp parallel for schedule(static) // default(shared) shared(' + shared + ')')
    if runtime.OMP_NUM_THREADS is None:
        pragma = cgen.Comment(pragma)

    loop = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        pragma,
        cgen.For('int ' + i + '=0',
                 i + '<_N_LOCAL',
                 i + '++',
                 block)
    ])

    self._components['LIB_OUTER_LOOP'] = loop
def _make_header(cls, **kwargs):
    kwargs.pop('pragmas', None)

    construct = cls._make_construct(**kwargs)
    clauses = cls._make_clauses(**kwargs)
    header = c.Pragma(' '.join([construct] + clauses))

    return (header,), kwargs
def execute_parallel_block(self):
    statements = []
    if self.profiling:
        if self.numevents_papi > 0:
            statements += [self.grid.define_papi_events]
            statements.append(
                cgen.Statement("opesci_papi_start_counters(numevents, events)"))
        else:
            statements.append(cgen.Value("float", "real_time"))
            statements.append(cgen.Value("float", "proc_time"))
            statements.append(cgen.Value("float", "mflops"))
            statements.append(cgen.Value("long long", "flpins"))
            statements.append(
                cgen.Statement("opesci_flops(&real_time, &proc_time, &flpins, &mflops)"))
    statements.append(self.grid.initialise)
    statements.append(self.execute_time_loop())
    if self.profiling:
        if self.numevents_papi > 0:
            statements.append(
                cgen.Statement("opesci_papi_read_counters(numevents, counters)"))
            statements.append(cgen.Pragma("omp critical"))
            statements.append(cgen.Block(self.grid.sum_papi_events()))
        else:
            statements.append(
                cgen.Statement("opesci_flops(&real_time, &proc_time, &flpins, &mflops)"))
            statements.append(cgen.Pragma("omp critical"))
            critical_block = []
            critical_block.append(
                cgen.Assign("profiling->g_rtime", "fmax(profiling->g_rtime, real_time)"))
            critical_block.append(
                cgen.Assign("profiling->g_ptime", "fmax(profiling->g_ptime, proc_time)"))
            # No trailing semicolon here: cgen.Statement appends one itself
            critical_block.append(
                cgen.Statement("profiling->g_mflops += mflops"))
            statements.append(cgen.Block(critical_block))
    return [cgen.Pragma("omp parallel"), cgen.Block(statements)]
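# Sketch of the profiling epilogue shape built above (field names copied from
# the snippet): an `omp critical` pragma guarding the reduction block.
import cgen

print(cgen.Pragma("omp critical"))
print(cgen.Block([
    cgen.Assign("profiling->g_rtime", "fmax(profiling->g_rtime, real_time)"),
    cgen.Assign("profiling->g_ptime", "fmax(profiling->g_ptime, proc_time)"),
    cgen.Statement("profiling->g_mflops += mflops")
]))
# #pragma omp critical
# {
#   profiling->g_rtime = fmax(profiling->g_rtime, real_time);
#   profiling->g_ptime = fmax(profiling->g_ptime, proc_time);
#   profiling->g_mflops += mflops;
# }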
def make_gpudirect(self, iet):
    mapper = {}
    for node in FindNodes((IsendCall, IrecvCall)).visit(iet):
        header = c.Pragma('omp target data use_device_ptr(%s)'
                          % node.arguments[0].name)
        mapper[node] = Block(header=header, body=node)

    iet = Transformer(mapper).visit(iet)

    return iet, {}
class DeviceAccizer(DeviceOmpizer):

    lang = dict(DeviceOmpizer.__base__.lang)
    lang.update({
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present': lambda i, j:
            c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-pointers': lambda i:
            c.Pragma('acc host_data use_device(%s)' % i)
    })

    _Iteration = DeviceOpenACCIteration

    @classmethod
    def _map_present(cls, f):
        # TODO: currently this is unused, because we cannot yet distinguish between
        # "real" Arrays and Functions that "act as Arrays", created by the compiler
        # to build support routines (e.g., the Sendrecv/Gather/Scatter MPI Callables).
        # We should only use "#pragma acc present" for *real* Arrays -- that is
        # temporaries that are born and die on the Device
        return cls.lang['map-present'](f.name,
                                       ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name,
                                           ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_pointers(cls, functions):
        return cls.lang['map-pointers'](','.join(f.name for f in functions))

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)
        metadata['includes'] = ['openacc.h']
        return iet, metadata
def _make_header(cls, **kwargs):
    construct = cls._make_construct(**kwargs)
    clauses = cls._make_clauses(**kwargs)
    header = c.Pragma(' '.join([construct] + clauses))

    # Extract the Iteration Properties
    properties = cls._process_properties(**kwargs)

    # Drop the unrecognised or unused kwargs
    kwargs = cls._process_kwargs(**kwargs)

    return (header,), kwargs, properties
class BasicRewriter(AbstractRewriter):

    lang_intel_common = {
        'ignore-deps': cgen.Pragma('ivdep'),
        'ntstores': cgen.Pragma('vector nontemporal'),
        'storefence': cgen.Statement('_mm_sfence()'),
        'noinline': cgen.Pragma('noinline')
    }
    lang = {
        'IntelCompiler': lang_intel_common,
        'IntelKNLCompiler': lang_intel_common
    }
    """
    Collection of backend-compiler-specific pragmas.
    """

    def _pipeline(self, state):
        self._avoid_denormals(state)

    def _backend_compiler_pragma(self, name, default=None):
        key = configuration['compiler'].__class__.__name__
        return self.lang.get(key, {}).get(name, default)

    @dle_pass
    def _avoid_denormals(self, iet):
        """
        Introduce nodes in the Iteration/Expression tree that will expand to C
        macros telling the CPU to flush denormal numbers in hardware. Denormals
        are normally flushed when using SSE-based instruction sets, except when
        compiling shared objects.
        """
        header = [cgen.Comment('Flush denormal numbers to zero in hardware'),
                  cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
                  cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')]
        iet = List(header=header, body=iet)
        return (iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')})
def mpi_gpu_direct(iet, **kwargs):
    """
    Modify MPI Callables to enable multiple GPUs performing GPU-Direct communication.
    """
    mapper = {}
    for node in FindNodes((IsendCall, IrecvCall)).visit(iet):
        header = c.Pragma('omp target data use_device_ptr(%s)'
                          % node.arguments[0].name)
        mapper[node] = Block(header=header, body=node)

    iet = Transformer(mapper).visit(iet)

    return iet, {}
def _generate_lib_outer_loop(self):

    block = cgen.Block([self._components['LIB_KERNEL_GATHER'],
                        self._components['LIB_INNER_LOOP'],
                        self._components['LIB_KERNEL_SCATTER']])

    cx = self._components['LIB_CELL_CX']
    cy = self._components['LIB_CELL_CY']
    cz = self._components['LIB_CELL_CZ']
    ncx = self._components['N_CELL_X']
    ncy = self._components['N_CELL_Y']
    ncz = self._components['N_CELL_Z']

    exec_count = self._components['EXEC_COUNT']
    red_exec_count = '_' + exec_count

    npad = self._components['N_CELL_PAD']

    shared = ''
    for sx in self._components['OMP_SHARED_SYMS']:
        shared += sx + ','
    shared = shared[:-1]

    pragma = cgen.Pragma('omp parallel for default(none) reduction(+:' +
                         red_exec_count + ') schedule(dynamic) collapse(3) ' +
                         'shared(' + shared + ')')
    if runtime.OMP_NUM_THREADS is None:
        pragma = cgen.Comment(pragma)

    loop = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        cgen.Line('INT64 ' + red_exec_count + ' = 0;'),
        pragma,
        # cellx loop
        cgen.For('INT64 ' + cx + '=' + npad,
                 cx + '<' + ncx + '-' + npad,
                 cx + '++',
                 cgen.Block([
                     cgen.For('INT64 ' + cy + '=' + npad,
                              cy + '<' + ncy + '-' + npad,
                              cy + '++',
                              cgen.Block((
                                  cgen.For('INT64 ' + cz + '=' + npad,
                                           cz + '<' + ncz + '-' + npad,
                                           cz + '++',
                                           block),
                              ))),
                 ])),
        cgen.Line('*' + exec_count + ' += ' + red_exec_count + ';')
    ])

    self._components['LIB_OUTER_LOOP'] = loop
class DeviceAccizer(DeviceOmpizer):

    lang = {
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
    }

    _Iteration = DeviceOpenACCIteration

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)
        metadata['includes'] = ['openacc.h']
        return iet, metadata
class Ompizer(object):

    COLLAPSE = 32
    """Use a collapse clause if the number of available physical cores is
    greater than this threshold."""

    lang = {
        'for': c.Pragma('omp for schedule(static)'),
        'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
        'par-region': lambda i: c.Pragma('omp parallel %s' % i),
        'par-for': c.Pragma('omp parallel for schedule(static)'),
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key):
        """
        :param key: A function returning True if ``v`` can be parallelized,
                    False otherwise.
        """
        self.key = key

    def _pragma_for(self, root, candidates):
        # Heuristic: if at least two parallel loops are available and the
        # physical core count is greater than COLLAPSE, then omp-collapse them
        nparallel = len(candidates)
        if (psutil.cpu_count(logical=False) < Ompizer.COLLAPSE
                or nparallel < 2
                or not IsPerfectIteration().visit(root)):
            return self.lang['for']
        else:
            return self.lang['collapse'](nparallel)

    def _make_parallel_tree(self, root, candidates):
        """
        Return a mapper to parallelize the :class:`Iteration`s within ``root``.
        """
        parallel = self._pragma_for(root, candidates)

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            exprs = FindNodes(Expression).visit(root)
            subs = {i: List(header=self.lang['atomic'], body=i)
                    for i in exprs if i.is_increment}
            handle = Transformer(subs).visit(root)
            mapper[root] = handle._rebuild(pragmas=root.pragmas + (parallel,))
        else:
            mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

        return mapper

    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` triggering thread-level parallelism.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iteration go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in FindSymbols('symbolics').visit(root)
                                if i.is_Array and i._mem_stack])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private),
                               body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        return Transformer(mapper).visit(iet)
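# Usage sketch: the two pragmas _pragma_for chooses between, rendered with the
# entries copied from the `lang` dict above.
import cgen as c

print(c.Pragma('omp for schedule(static)'))
# #pragma omp for schedule(static)
print(c.Pragma('omp for collapse(%d) schedule(static)' % 2))
# #pragma omp for collapse(2) schedule(static)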
class DeviceOmpizer(Ompizer):

    COLLAPSE_NCORES = 1
    """
    Always collapse when possible.
    """

    COLLAPSE_WORK = 1
    """
    Always collapse when possible.
    """

    lang = dict(Ompizer.lang)
    lang.update({
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('omp target exit data map(release: %s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('omp target exit data map(delete: %s%s)' % (i, j)),
    })

    _Iteration = DeviceOpenMPIteration

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f):
        return cls.lang['map-enter-to'](f.name,
                                        ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_alloc(cls, f):
        return cls.lang['map-enter-alloc'](f.name,
                                           ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_present(cls, f):
        raise NotImplementedError

    @classmethod
    def _map_update(cls, f):
        return cls.lang['map-update'](f.name,
                                      ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_release(cls, f):
        return cls.lang['map-release'](f.name,
                                       ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name,
                                           ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_pointers(cls, f):
        raise NotImplementedError

    def _make_threaded_prodders(self, partree):
        # no-op for now
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for GPU offloading.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree

        # Create a ParallelTree
        body = self._Iteration(ncollapse=ncollapse, **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # no-op for now
        return partree

    def _make_guard(self, partree, *args):
        # no-op for now
        return partree

    def _make_nested_partree(self, partree):
        # no-op for now
        return partree
class Ompizer(object):

    lang = {
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'thread-num': DefFunction('omp_get_thread_num')
    }
    """
    Shortcuts for the OpenMP language.
    """

    _Region = OpenMPRegion
    _Iteration = OpenMPIteration

    def __init__(self, sregistry, options, key=None):
        """
        Parameters
        ----------
        sregistry : SymbolRegistry
            The symbol registry, to quickly access the special symbols that may
            appear in the IET (e.g., `sregistry.threadid`, `sregistry.nthreads`).
        options : dict
            The optimization options. Accepted: ['par-collapse-ncores',
            'par-collapse-work', 'par-chunk-nonaffine', 'par-dynamic-work',
            'par-nested']
            * 'par-collapse-ncores': use a collapse clause if the number of
              available physical cores is greater than this threshold.
            * 'par-collapse-work': use a collapse clause if the trip count of the
              collapsable Iterations is statically known to exceed this threshold.
            * 'par-chunk-nonaffine': coefficient to adjust the chunk size in
              non-affine parallel Iterations.
            * 'par-dynamic-work': use dynamic scheduling if the operation count per
              iteration exceeds this threshold. Otherwise, use static scheduling.
            * 'par-nested': nested parallelism if the number of hyperthreads per
              core is greater than this threshold.
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        self.sregistry = sregistry

        self.collapse_ncores = options['par-collapse-ncores']
        self.collapse_work = options['par-collapse-work']
        self.chunk_nonaffine = options['par-chunk-nonaffine']
        self.dynamic_work = options['par-dynamic-work']
        self.nested = options['par-nested']

        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorized

    @property
    def nthreads(self):
        return self.sregistry.nthreads

    @property
    def nthreads_nested(self):
        return self.sregistry.nthreads_nested

    @property
    def nthreads_nonaffine(self):
        return self.sregistry.nthreads_nonaffine

    @property
    def threadid(self):
        return self.sregistry.threadid

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.collapse_ncores:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n + 1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.collapse_work:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)

        return collapsable

    @classmethod
    def _make_tid(cls, tid):
        return c.Initializer(c.Value(tid._C_typedata, tid.name),
                             cls.lang['thread-num'])

    def _make_reductions(self, partree, collapsed):
        if not any(i.is_ParallelAtomic for i in collapsed):
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_threaded_prodders(self, partree):
        mapper = {i: ThreadedProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.dynamic_work:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = OpenMPIteration(schedule=schedule, ncollapse=ncollapse,
                                       **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = OpenMPIteration(schedule=schedule, parallel=True,
                                       ncollapse=ncollapse, nthreads=nthreads,
                                       **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine

            chunk_size = Symbol(name='chunk_size')
            body = OpenMPIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                   **root.args)

            niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree, parrays):
        arrays = [i for i in FindSymbols().visit(partree) if i.is_Array]

        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_private = [i for i in arrays if i._mem_heap and i._mem_local]
        heap_globals = []
        for i in heap_private:
            if i in parrays:
                pi = parrays[i]
            else:
                pi = parrays.setdefault(i, PointerArray(name=self.sregistry.make_name(),
                                                        dimensions=(self.threadid,),
                                                        array=i))
            heap_globals.append(Dereference(i, pi))
        if heap_globals:
            body = List(header=self._make_tid(self.threadid),
                        body=heap_globals + [partree], footer=c.Line())
        else:
            body = partree

        return OpenMPRegion(body, partree.nthreads)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= self.nested:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if self.key(i) and any(is_integer(j.step - i.symbolic_size)
                                       for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(candidates,
                                                        self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if root in mapper:
                continue

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree, parrays)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [i for i in FindSymbols().visit(iet) if isinstance(i, (NThreadsMixin))]
        for n in FindNodes(Dereference).visit(iet):
            args.extend([(n.array, True), n.parray])

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
""" simd_reg_size = kwargs.pop('simd_reg_size') mapper = {} for tree in retrieve_iteration_tree(iet): candidates = [i for i in tree if i.is_Parallel] # As long as there's an outer level of parallelism, the innermost # PARALLEL Iteration gets vectorized if len(candidates) < 2: continue candidate = candidates[-1] # Construct OpenMP SIMD pragma aligned = [ j for j in FindSymbols('symbolics').visit(candidate) if j.is_DiscreteFunction ] if aligned: simd = self.lang['simd-for-aligned'] simd = as_tuple( simd(','.join([j.name for j in aligned]), simd_reg_size)) else: simd = as_tuple(self.lang['simd-for']) pragmas = candidate.pragmas + simd # Add VECTORIZED property properties = list(candidate.properties) + [VECTORIZED] mapper[candidate] = candidate._rebuild(pragmas=pragmas, properties=properties) iet = Transformer(mapper).visit(iet) return iet, {}
import cpuinfo
import numpy as np
import cgen as c

"""
A dictionary to quickly access standard OpenMP pragmas
"""
omplang = {
    'for': c.Pragma('omp for schedule(static)'),
    'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
    'par-region': c.Pragma('omp parallel'),
    'par-for': c.Pragma('omp parallel for schedule(static)'),
    'simd-for': c.Pragma('omp simd'),
    'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j))
}

"""
Compiler-specific language
"""
complang_ALL = {
    'IntelCompiler': {'ignore-deps': c.Pragma('ivdep'),
                      'ntstores': c.Pragma('vector nontemporal'),
                      'storefence': c.Statement('_mm_sfence()'),
                      'noinline': c.Pragma('noinline')}
}
complang_ALL['IntelKNLCompiler'] = complang_ALL['IntelCompiler']

"""
SIMD generic info
"""
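# Usage sketch for the shortcut dictionaries above (assumes omplang and
# complang_ALL from the module above are in scope):
print(omplang['par-for'])                      # #pragma omp parallel for schedule(static)
print(omplang['simd-for-aligned']('u,v', 32))  # #pragma omp simd aligned(u,v:32)
print(complang_ALL['IntelCompiler']['ignore-deps'])  # #pragma ivdep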
import cpuinfo
import numpy as np
import cgen as c

"""
Compiler-specific language
"""
complang_ALL = {
    'IntelCompiler': {
        'ignore-deps': c.Pragma('ivdep'),
        'ntstores': c.Pragma('vector nontemporal'),
        'storefence': c.Statement('_mm_sfence()'),
        'noinline': c.Pragma('noinline')
    }
}
complang_ALL['IntelKNLCompiler'] = complang_ALL['IntelCompiler']

"""
SIMD generic info
"""
simdinfo = {
    # Sizes in bytes of a vector register
    'sse': 16,
    'sse4_2': 16,
    'avx': 32,
    'avx2': 32,
    'avx512f': 64
}


def get_simd_flag():
    """Retrieve the best SIMD flag on the current architecture."""
class Ompizer(object):

    NESTED = 2
    """
    Use nested parallelism if the number of hyperthreads per core is greater
    than this threshold.
    """

    COLLAPSE_NCORES = 4
    """
    Use a collapse clause if the number of available physical cores is greater
    than this threshold.
    """

    COLLAPSE_WORK = 100
    """
    Use a collapse clause if the trip count of the collapsable Iterations
    exceeds this threshold. Note however the trip count is rarely known at
    compilation time (e.g., this may happen when DefaultDimensions are used).
    """

    CHUNKSIZE_NONAFFINE = 3
    """
    Coefficient to adjust the chunk size in parallelized non-affine Iterations.
    """

    DYNAMIC_WORK = 10
    """
    Use dynamic scheduling if the operation count per iteration exceeds this
    threshold. Otherwise, use static scheduling.
    """

    lang = {
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            def key(i):
                if i.uindices:
                    # Iteration must be in OpenMP canonical form
                    return False
                return i.is_ParallelRelaxed and not i.is_Vectorized
            self.key = key
        self.nthreads = NThreads(aliases='nthreads0')
        self.nthreads_nested = NThreadsNested(aliases='nthreads1')
        self.nthreads_nonaffine = NThreadsNonaffine(aliases='nthreads2')

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.COLLAPSE_NCORES:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n+1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.COLLAPSE_WORK:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)

        return collapsable

    def _make_reductions(self, partree, collapsed):
        if not partree.is_ParallelAtomic:
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_threaded_prodders(self, partree):
        mapper = {i: ThreadedProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.DYNAMIC_WORK:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = ParallelIteration(schedule=schedule, ncollapse=ncollapse,
                                         **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = ParallelIteration(schedule=schedule, parallel=True,
                                         ncollapse=ncollapse, nthreads=nthreads,
                                         **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine

            chunk_size = Symbol(name='chunk_size')
            body = ParallelIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                     **root.args)

            niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads*self.CHUNKSIZE_NONAFFINE), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # Build the `omp-parallel` region
        private = [i for i in FindSymbols().visit(partree)
                   if i.is_Array and i._mem_stack]
        private = sorted(set([i.name for i in private]))
        return ParallelRegion(partree, partree.nthreads, private)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= Ompizer.NESTED:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if any(is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(candidates,
                                                        self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The used `nthreads` arguments
        args = [i for i in FindSymbols().visit(iet) if isinstance(i, (NThreadsMixin))]

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
        """
        simd_reg_size = kwargs.pop('simd_reg_size')

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            candidates = [i for i in tree if i.is_Parallel]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(candidates) < 2:
                continue
            candidate = candidates[-1]

            # Construct OpenMP SIMD pragma
            aligned = [j for j in FindSymbols('symbolics').visit(candidate)
                       if j.is_DiscreteFunction]
            if aligned:
                simd = self.lang['simd-for-aligned']
                simd = as_tuple(simd(','.join([j.name for j in aligned]),
                                     simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])
            pragmas = candidate.pragmas + simd

            # Add VECTORIZED property
            properties = list(candidate.properties) + [VECTORIZED]

            mapper[candidate] = candidate._rebuild(pragmas=pragmas,
                                                   properties=properties)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
def __init__(self, is_header: bool):
    self.includes = []
    self.objects = []
    if is_header:
        self.add(cgen.Pragma("once"))
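# Sketch: for a header file, the constructor above emits the include guard
import cgen
print(cgen.Pragma("once"))  # #pragma once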
def _make_header(cls, nthreads, private):
    private = ('private(%s)' % ','.join(private)) if private else ''
    return c.Pragma('omp parallel num_threads(%s) %s' % (nthreads.name, private))
class DeviceOmpizer(Ompizer):

    lang = dict(Ompizer.lang)
    lang.update({
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j:
            c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('omp target exit data map(release: %s%s)' % (i, j)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('omp target exit data map(delete: %s%s)%s' % (i, j, k)),
    })

    _Iteration = DeviceOpenMPIteration

    def __init__(self, sregistry, options, key=None):
        super().__init__(sregistry, options, key=key)
        self.gpu_fit = options['gpu-fit']
        self.par_disabled = options['par-disabled']

    @classmethod
    def _make_sections_from_imask(cls, f, imask):
        datasize = cls._map_data(f)
        if imask is None:
            imask = [FULL]*len(datasize)
        assert len(imask) == len(datasize)
        sections = []
        for i, j in zip(imask, datasize):
            if i is FULL:
                start, size = 0, j
            else:
                try:
                    start, size = i
                except TypeError:
                    start, size = i, 1
            start = ccode(start)
            sections.append('[%s:%s]' % (start, size))
        return ''.join(sections)

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-to'](f.name, sections)

    _map_to_wait = _map_to

    @classmethod
    def _map_alloc(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-alloc'](f.name, sections)

    @classmethod
    def _map_present(cls, f, imask=None):
        return

    @classmethod
    def _map_update(cls, f):
        return cls.lang['map-update'](f.name,
                                      ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_update_host(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-host'](f.name, sections)

    _map_update_wait_host = _map_update_host

    @classmethod
    def _map_update_device(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-device'](f.name, sections)

    _map_update_wait_device = _map_update_device

    @classmethod
    def _map_release(cls, f):
        return cls.lang['map-release'](f.name,
                                       ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        # This ugly condition is to avoid a copy-back when, due to
        # domain decomposition, the local size of a Function is 0, which
        # would cause a crash
        cond = ' if(%s)' % ' && '.join('(%s != 0)' % i for i in cls._map_data(f))
        return cls.lang['map-exit-delete'](f.name, sections, cond)

    @classmethod
    def _map_pointers(cls, f):
        raise NotImplementedError

    def _make_threaded_prodders(self, partree):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_threaded_prodders(partree)

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for parallelism. In particular:

            * All parallel Iterations not *writing* to a host Function, that is
              a Function `f` such that ``is_on_device(f) == False``, are offloaded
              to the device.
            * The remaining ones, that is those writing to a host Function, are
              parallelized on the host.
        """
        assert candidates
        root = candidates[0]

        if is_on_device(root, self.gpu_fit, only_writes=True):
            # The typical case: all written Functions are device Functions, that is
            # they're mapped in the device memory. Then we offload `root` to the device

            # Get the collapsable Iterations
            collapsable = self._find_collapsable(root, candidates)
            ncollapse = 1 + len(collapsable)

            body = self._Iteration(gpu_fit=self.gpu_fit, ncollapse=ncollapse,
                                   **root.args)
            partree = ParallelTree([], body, nthreads=nthreads)
            collapsed = [partree] + collapsable

            return root, partree, collapsed
        elif not self.par_disabled:
            # Resort to host parallelism
            return super()._make_partree(candidates, nthreads)
        else:
            return root, None, None

    def _make_parregion(self, partree, *args):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_parregion(partree, *args)

    def _make_guard(self, parregion, *args):
        partrees = FindNodes(ParallelTree).visit(parregion)
        if any(isinstance(i.root, DeviceOpenMPIteration) for i in partrees):
            # no-op for now
            return parregion
        else:
            return super()._make_guard(parregion, *args)

    def _make_nested_partree(self, partree):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_nested_partree(partree)
class Ompizer(object):

    COLLAPSE = 32
    """Use a collapse clause if the number of available physical cores is
    greater than this threshold."""

    lang = {
        'for': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
        'par-region': lambda nt, i: c.Pragma('omp parallel num_threads(%s) %s' % (nt, i)),
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorizable
        self.nthreads = NThreads(name='nthreads')

    def _ncollapse(self, root, candidates):
        # The OpenMP specification forbids collapsed loops to use iteration variables
        # in initializer expressions. For example, the following is forbidden:
        #
        # #pragma omp ... collapse(2)
        # for (int i = ... )
        #   for (int j = i ...)
        #     ...
        #
        # Below, we make sure this won't happen
        for n, i in enumerate(candidates):
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break
        candidates = candidates[:n]

        # Heuristic: if at least two parallel loops are available and the
        # physical core count is greater than COLLAPSE, then omp-collapse them
        nparallel = len(candidates)
        isperfect = IsPerfectIteration().visit(root)
        if ncores() < Ompizer.COLLAPSE or nparallel < 2 or not isperfect:
            return 1
        else:
            return nparallel

    def _make_parallel_tree(self, root, candidates):
        """Parallelize the IET rooted in `root`."""
        ncollapse = self._ncollapse(root, candidates)
        parallel = self.lang['for'](ncollapse)

        pragmas = root.pragmas + (parallel,)
        properties = root.properties + (COLLAPSED(ncollapse),)

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            exprs = FindNodes(Expression).visit(root)
            subs = {i: List(header=self.lang['atomic'], body=i)
                    for i in exprs if i.is_Increment}
            handle = Transformer(subs).visit(root)
            mapper[root] = handle._rebuild(pragmas=pragmas, properties=properties)
        else:
            mapper[root] = root._rebuild(pragmas=pragmas, properties=properties)

        root = Transformer(mapper).visit(root)

        return root

    def make_parallel(self, iet):
        """Transform ``iet`` by introducing shared-memory parallelism."""
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the first omp-parallelizable Iteration in `tree`
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                continue
            root = candidates[0]

            # Build the `omp-for` tree
            partree = self._make_parallel_tree(root, candidates)

            # Find out the thread-private and thread-shared variables
            private = [i for i in FindSymbols().visit(partree)
                       if i.is_Array and i._mem_stack]

            # Build the `omp-parallel` region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            partree = Block(header=self.lang['par-region'](self.nthreads.name, private),
                            body=partree)

            # Do not enter the parallel region if the step increment might be 0; this
            # would raise a `Floating point exception (core dumped)` in some OpenMP
            # implementation. Note that using an OpenMP `if` clause won't work
            if isinstance(root.step, Symbol):
                cond = Conditional(CondEq(root.step, 0),
                                   Element(c.Statement('return')))
                partree = List(body=[cond, partree])

            mapper[root] = partree

        iet = Transformer(mapper).visit(iet)

        return iet, {'input': [self.nthreads] if mapper else []}
class Ompizer(object):

    NESTED = 2
    """
    Use nested parallelism if the number of hyperthreads per core is greater
    than this threshold.
    """

    COLLAPSE = 32
    """
    Use a collapse clause if the number of available physical cores is greater
    than this threshold.
    """

    lang = {
        'for': lambda i: c.Pragma('omp for collapse(%d) schedule(static,1)' % i),
        'par-for': lambda i, j: c.Pragma('omp parallel for collapse(%d) '
                                         'schedule(static,1) num_threads(%d)' % (i, j)),
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorizable
        self.nthreads = NThreads(name='nthreads')

    def _make_atomic_incs(self, partree):
        if not partree.is_ParallelAtomic:
            return partree
        # Introduce one `omp atomic` pragma for each increment
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]
        mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_atomic_prodders(self, partree):
        # Atomic-ize any single-thread Prodders in the parallel tree
        mapper = {i: SingleThreadProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, omp_pragma):
        """Parallelize `root` attaching a suitable OpenMP pragma."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = []
        if ncores() >= Ompizer.COLLAPSE and IsPerfectIteration().visit(root):
            for n, i in enumerate(candidates[1:], 1):
                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorizable:
                    break

                collapsable.append(i)

        # Attach an OpenMP pragma-for with a collapse clause
        ncollapse = 1 + len(collapsable)
        partree = root._rebuild(pragmas=root.pragmas + (omp_pragma(ncollapse),),
                                properties=root.properties + (COLLAPSED(ncollapse),))

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # Build the `omp-parallel` region
        private = [i for i in FindSymbols().visit(partree)
                   if i.is_Array and i._mem_stack]
        private = sorted(set([i.name for i in private]))
        return ParallelRegion(partree, self.nthreads, private)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= Ompizer.NESTED:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            index = tree.index(partree)
            outer = tree[index:index + partree.ncollapsed]
            inner = tree[index + partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if any(is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            omp_pragma = lambda i: self.lang['par-for'](i, nhyperthreads())
            subroot, subpartree, _ = self._make_partree(candidates, omp_pragma)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def make_parallel(self, iet):
        """Transform ``iet`` by introducing shared-memory parallelism."""
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates, self.lang['for'])

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Ensure increments are atomic
            partree = self._make_atomic_incs(partree)

            # Ensure single-thread prodders are atomic
            partree = self._make_atomic_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        return iet, {'args': [self.nthreads] if mapper else [], 'includes': ['omp.h']}
class DeviceAccizer(DeviceOmpizer):

    lang = dict(DeviceOmpizer.__base__.lang)
    lang.update({
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-to-wait': lambda i, j, k:
            (c.Pragma('acc enter data copyin(%s%s) async(%s)' % (i, j, k)),
             c.Pragma('acc wait(%s)' % k)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present': lambda i, j:
            c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('acc update self(%s%s)' % (i, j)),
        'map-update-wait-host': lambda i, j, k:
            (c.Pragma('acc update self(%s%s) async(%s)' % (i, j, k)),
             c.Pragma('acc wait(%s)' % k)),
        'map-update-device': lambda i, j:
            c.Pragma('acc update device(%s%s)' % (i, j)),
        'map-update-wait-device': lambda i, j, k:
            (c.Pragma('acc update device(%s%s) async(%s)' % (i, j, k)),
             c.Pragma('acc wait(%s)' % k)),
        'map-release': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-pointers': lambda i:
            c.Pragma('acc host_data use_device(%s)' % i)
    })

    _Iteration = DeviceOpenACCIteration

    @classmethod
    def _map_to_wait(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-to-wait'](f.name, sections, queueid)

    @classmethod
    def _map_present(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-present'](f.name, sections)

    @classmethod
    def _map_delete(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-exit-delete'](f.name, sections)

    @classmethod
    def _map_update_wait_host(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-wait-host'](f.name, sections, queueid)

    @classmethod
    def _map_update_wait_device(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-wait-device'](f.name, sections, queueid)

    @classmethod
    def _map_pointers(cls, functions):
        return cls.lang['map-pointers'](','.join(f.name for f in functions))

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)
        metadata['includes'] = ['openacc.h']
        return iet, metadata
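# Sketch (illustrative arguments): unlike the synchronous entries, the
# 'map-update-wait-host' entry above returns a *pair* of pragmas, an
# asynchronous update plus the matching wait on the same queue id.
import cgen as c

pair = (c.Pragma('acc update self(u[0:x_size]) async(0)'),
        c.Pragma('acc wait(0)'))
for p in pair:
    print(p)
# #pragma acc update self(u[0:x_size]) async(0)
# #pragma acc wait(0)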
class OffloadingOmpizer(Ompizer):

    COLLAPSE_NCORES = 1
    """
    Always collapse when possible.
    """

    COLLAPSE_WORK = 1
    """
    Always collapse when possible.
    """

    lang = dict(Ompizer.lang)
    lang.update({
        'par-for-teams': lambda i:
            c.Pragma('omp target teams distribute parallel for collapse(%d)' % i),
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-exit-from': lambda i, j:
            c.Pragma('omp target exit data map(from: %s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('omp target exit data map(delete: %s%s)' % (i, j)),
    })

    def __init__(self, key=None):
        if key is None:
            key = lambda i: i.is_ParallelRelaxed
        super(OffloadingOmpizer, self).__init__(key=key)

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f):
        return cls.lang['map-enter-to'](f.name,
                                        ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_alloc(cls, f):
        return cls.lang['map-enter-alloc'](f.name,
                                           ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_from(cls, f):
        return cls.lang['map-exit-from'](f.name,
                                         ''.join('[0:%s]' % i for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name,
                                           ''.join('[0:%s]' % i for i in cls._map_data(f)))

    def _make_threaded_prodders(self, partree):
        # no-op for now
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for GPU offloading.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        omp_pragma = self.lang['par-for-teams'](ncollapse)

        # Create a ParallelTree
        body = root._rebuild(pragmas=root.pragmas + (omp_pragma,),
                             properties=root.properties + (COLLAPSED(ncollapse),))
        partree = ParallelTree([], body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # no-op for now
        return partree

    def _make_guard(self, partree, *args):
        # no-op for now
        return partree

    def _make_nested_partree(self, partree):
        # no-op for now
        return partree
def _cgen(self):
    return cgen.Pragma(self.value)
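# Usage sketch with a hypothetical node class exposing a `value` attribute:
import cgen

class PragmaNode:
    def __init__(self, value):
        self.value = value

    def _cgen(self):
        return cgen.Pragma(self.value)

print(PragmaNode('unroll')._cgen())  # #pragma unroll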