def _emit_build_profiling(self):
    """
    Log a hierarchical breakdown of the time spent building this Operator,
    highlighting the hottest compilation phases at PERF log level.
    """
    if not is_log_enabled_for('PERF'):
        return

    # Rounder to K decimal places (rounds *up* via ceil)
    fround = lambda i, n=100: ceil(i * n) / n

    timings = self._profiler.py_timers.copy()
    tot = timings.pop('op-compile')
    perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

    max_hotspots = 3
    threshold = 20.

    def _emit_timings(timings, indent=''):
        # 'total' aggregates the sibling entries; it is not a phase itself
        timings.pop('total', None)
        hottest = sorted(timings, key=lambda i: timings[i]['total'], reverse=True)
        for i in hottest[:max_hotspots]:
            elapsed = fround(timings[i]['total'])
            perc = fround(elapsed / tot * 100, n=10)
            if perc > threshold:
                perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), elapsed, perc))
                # Recurse into the sub-phases with a deeper indent
                _emit_timings(timings[i], ' ' * len(indent) + ' * ')

    _emit_timings(timings, ' * ')

    if self._profiler._ops:
        # Report the flop counts before/after each symbolic optimization pass
        ops = ['%d --> %d' % i for i in self._profiler._ops]
        perf("Flops reduction after symbolic optimization: [%s]" % ' ; '.join(ops))
def _emit_timings(timings, indent=''):
    """
    Recursively log the hottest entries of a nested timings dict.

    NOTE(review): this relies on names from an enclosing scope
    (`max_hotspots`, `fround`, `tot`, `threshold`, `perf`) — it appears to be
    a nested helper lifted out of its parent function; confirm before reuse.
    """
    # 'total' aggregates the sibling entries; drop it before ranking
    timings.pop('total', None)
    ranked = sorted(timings, key=lambda k: timings[k]['total'], reverse=True)
    for key in ranked[:max_hotspots]:
        elapsed = fround(timings[key]['total'])
        perc = fround(elapsed / tot * 100, n=10)
        if perc > threshold:
            perf("%s%s: %.2f s (%.1f %%)" % (indent, key.lstrip('_'), elapsed, perc))
            # Descend into sub-phases with a deeper indent
            _emit_timings(timings[key], ' ' * len(indent) + ' * ')
def _profile_output(self, args):
    """Produce a performance summary of the profiled sections."""
    summary = self._profiler.summary(args, self._dtype)
    info("Operator `%s` run in %.2f s" % (self.name, sum(summary.timings.values())))
    for section, data in summary.items():
        shapes = [",".join(str(i) for i in its) for its in data.itershapes]
        if not shapes:
            name = None
        elif len(shapes) == 1:
            name = "%s<%s>" % (section, shapes[0])
        else:
            name = "%s<%s>" % (section, ",".join("<%s>" % i for i in shapes))
        # GPts/s is only meaningful when a point count was measured
        gpointss = ", %.2f GPts/s" % data.gpointss if data.gpointss else ''
        perf("* %s with OI=%.2f computed in %.3f s [%.2f GFlops/s%s]"
             % (name, data.oi, data.time, data.gflopss, gpointss))
    return summary
def _jit_compile(self):
    """
    JIT-compile the C code generated by the Operator.

    It is ensured that JIT compilation will only be performed once per
    Operator, regardless of how many times this method is invoked.
    """
    # `self._lib` acts as the once-only guard: it is only None before the
    # first (successful) compilation
    if self._lib is None:
        with self._profiler.timer_on('jit-compile'):
            recompiled, src_file = self._compiler.jit_compile(self._soname,
                                                              str(self.ccode))

        elapsed = self._profiler.py_timers['jit-compile']
        if recompiled:
            perf("Operator `%s` jit-compiled `%s` in %.2f s with `%s`"
                 % (self.name, src_file, elapsed, self._compiler))
        else:
            # The compiler found a cached build and skipped recompilation
            perf("Operator `%s` fetched `%s` in %.2f s from jit-cache"
                 % (self.name, src_file, elapsed))
def _emit_build_profiling(self):
    """
    Log the total Operator generation time and, at PERF level, the few
    compilation phases that dominated it.
    """
    if not is_log_enabled_for('PERF'):
        return

    # Rounder to K decimal places (rounds *up* via ceil)
    fround = lambda i, n=100: ceil(i * n) / n

    timings = self._profiler.py_timers.copy()
    tot = timings.pop('op-compile')
    perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

    max_hotspots = 3
    ranked = sorted(timings, key=timings.get, reverse=True)
    for phase in ranked[:max_hotspots]:
        elapsed = fround(timings[phase])
        perc = fround(elapsed / tot * 100, n=10)
        # Only phases taking more than 20% of the total are worth reporting
        if perc > 20.:
            perf("- [Hotspot] %s: %.2f s (%.1f %%)"
                 % (phase.lstrip('_'), elapsed, perc))
def autotune(operator, arguments, parameters, tunable):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.
    """
    # We get passed all the arguments, but the cfunction only requires a subset
    at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters])

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of time-stepping dimension so that auto-tuner
    # runs will finish quickly
    steppers = [i for i in iterations if i.dim.is_Time]
    if len(steppers) == 0:
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers[0]
        start = at_arguments[stepper.dim.min_name]
        timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps
            perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
        at_arguments[stepper.dim.min_name] = start
        at_arguments[stepper.dim.max_name] = timesteps
        if stepper.dim.is_Stepping:
            at_arguments[stepper.dim.parent.min_name] = start
            at_arguments[stepper.dim.parent.max_name] = timesteps
    else:
        warning("AutoTuner: Couldn't understand loop structure; giving up")
        return arguments

    # Attempted block sizes ...
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [OrderedDict([(i, v) for i in mapper])
                  for v in options['at_blocksize']]
    # ... Always try the entire iteration space (degenerate block)
    itershape = [mapper[i].iteration.symbolic_extent.subs(arguments)
                 for i in mapper]
    blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j - 1))
                                   for i, j in zip(mapper, itershape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration['autotuning'].level == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [i.symbolic_shape for i in functions
                    if i.is_Array and i._mem_stack]
    stack_space = (sum(reduce(mul, i, 1) for i in stack_shapes) *
                   operator._dtype().itemsize)

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                start = mapper[k].original_dim.symbolic_start.subs(arguments)
                end = mapper[k].original_dim.symbolic_end.subs(arguments)
                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # `stack_space` is a plain number, not a symbolic expression
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            warning("AutoTuner: Couldn't determine stack size; "
                    "skipping block shape %s" % str(bs))
            continue

        # Use AutoTuner-specific profiler structs
        timer = operator.profiler.timer.reset()
        at_arguments[operator.profiler.name] = timer

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
        timings[tuple(bs.items())] = elapsed
        perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps"
             % (','.join('%d' % i for i in bs.values()), elapsed, timesteps))

    try:
        best = dict(min(timings, key=timings.get))
        perf("AutoTuner: Selected block shape %s" % best)
    except ValueError:
        # `timings` is empty: every candidate block shape was rejected
        warning("AutoTuner: Couldn't find legal block shapes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.name in tuned
    tuned[operator.profiler.name] = operator.profiler.timer.reset()

    return tuned
def log(msg):
    """Emit an autotuning message, uniformly prefixed, at PERF level."""
    perf("AutoTuner: %s" % msg)
def _emit_apply_profiling(self, args):
    """Produce a performance summary of the profiled sections."""
    # Rounder to 2 decimal places (rounds *up* via ceil)
    fround = lambda i: ceil(i * 100) / 100

    info("Operator `%s` ran in %.2f s"
         % (self.name, fround(self._profiler.py_timers['apply'])))

    summary = self._profiler.summary(args, self._dtype, reduce_over='apply')

    if not is_log_enabled_for('PERF'):
        # Do not waste time
        return summary

    if summary.globals:
        # Note that with MPI enabled, the global performance indicators
        # represent "cross-rank" performance data
        metrics = []

        v = summary.globals.get('vanilla')
        if v is not None:
            metrics.append("OI=%.2f" % fround(v.oi))
            metrics.append("%.2f GFlops/s" % fround(v.gflopss))

        v = summary.globals.get('fdlike')
        if v is not None:
            metrics.append("%.2f GPts/s" % fround(v.gpointss))

        if metrics:
            perf("Global performance: [%s]" % ', '.join(metrics))

        perf("Local performance:")
        indent = " " * 2
    else:
        indent = ""

    # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
    # thing that will be emitted
    for k, v in summary.items():
        rank = "[rank%d]" % k.rank if k.rank is not None else ""

        oi = "OI=%.2f" % fround(v.oi)
        gflopss = "%.2f GFlops/s" % fround(v.gflopss)
        gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
        metrics = ", ".join(i for i in [oi, gflopss, gpointss] if i is not None)

        shapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if not shapes:
            itershapes = ""
        elif len(shapes) == 1:
            itershapes = shapes[0]
        else:
            itershapes = ",".join("<%s>" % i for i in shapes)
        name = "%s%s<%s>" % (k.name, rank, itershapes)

        perf("%s* %s ran in %.2f s [%s]" % (indent, name, fround(v.time), metrics))
        # Sub-sections are reported as a percentage of the parent section
        for n, time in summary.subsections.get(k.name, {}).items():
            perf("%s+ %s ran in %.2f s [%.2f%%]"
                 % (indent * 2, n, time, fround(time / v.time * 100)))

    # Emit performance mode and arguments
    perf_args = {}
    for i in self.input + self.dimensions:
        if not i.is_PerfKnob:
            continue
        try:
            perf_args[i.name] = args[i.name]
        except KeyError:
            # Try with the aliases
            for a in i._arg_names:
                if a in args:
                    perf_args[a] = args[a]
                    break
    perf("Performance[mode=%s] arguments: %s"
         % (self._state['optimizations'], perf_args))

    return summary
def _emit_apply_profiling(self, args):
    """Produce a performance summary of the profiled sections."""
    # Rounder to 2 decimal places (rounds *up* via ceil)
    fround = lambda i: ceil(i * 100) / 100

    info("Operator `%s` run in %.2f s"
         % (self.name, fround(self._profiler.py_timers['apply'])))

    summary = self._profiler.summary(args, self._dtype, reduce_over='apply')

    if not is_log_enabled_for('PERF'):
        # Do not waste time
        return summary

    if summary.globals:
        indent = " " * 2
        perf("Global performance indicators")

        # With MPI enabled, the 'vanilla' entry contains "cross-rank" performance data
        v = summary.globals.get('vanilla')
        if v is not None:
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)
            perf("%s* Operator `%s` with OI=%.2f computed in %.2f s [%s]"
                 % (indent, self.name, fround(v.oi), fround(v.time), metrics))

        v = summary.globals.get('fdlike')
        if v is not None:
            perf("%s* Achieved %.2f FD-GPts/s" % (indent, v.gpointss))

        perf("Local performance indicators")
    else:
        indent = ""

    # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
    # thing that will be emitted
    for k, v in summary.items():
        rank = "[rank%d]" % k.rank if k.rank is not None else ""

        gflopss = "%.2f GFlops/s" % fround(v.gflopss)
        gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
        metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)

        shapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if len(shapes) > 1:
            name = "%s%s<%s>" % (k.name, rank,
                                 ",".join("<%s>" % i for i in shapes))
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]"
                 % (indent, name, fround(v.oi), fround(v.time), metrics))
        elif len(shapes) == 1:
            name = "%s%s<%s>" % (k.name, rank, shapes[0])
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]"
                 % (indent, name, fround(v.oi), fround(v.time), metrics))
        else:
            # No iteration shapes available: emit a terser line
            name = k.name
            perf("%s* %s%s computed in %.2f s"
                 % (indent, name, rank, fround(v.time)))

    # Emit relevant configuration values
    perf("Configuration: %s" % self._state['optimizations'])

    # Emit relevant performance arguments
    perf_args = {}
    for i in self.input + self.dimensions:
        if not i.is_PerfKnob:
            continue
        try:
            perf_args[i.name] = args[i.name]
        except KeyError:
            # Try with the aliases
            for a in i._arg_names:
                if a in args:
                    perf_args[a] = args[a]
                    break
    perf("Performance arguments: %s" % perf_args)

    return summary
def _profile_output(self, args):
    """Produce a performance summary of the profiled sections."""
    # Rounder to 2 decimal places (rounds *up* via ceil)
    fround = lambda i: ceil(i * 100) / 100

    info("Operator `%s` run in %.2f s"
         % (self.name, fround(self._profiler.py_timers['apply'])))

    summary = self._profiler.summary(args, self._dtype, reduce_over='apply')

    if summary.globals:
        indent = " " * 2
        perf("Global performance indicators")

        # With MPI enabled, the 'vanilla' entry contains "cross-rank" performance data
        v = summary.globals.get('vanilla')
        if v is not None:
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)
            perf("%s* Operator `%s` with OI=%.2f computed in %.2f s [%s]"
                 % (indent, self.name, fround(v.oi), fround(v.time), metrics))

        v = summary.globals.get('fdlike')
        if v is not None:
            perf("%s* Achieved %.2f FD-GPts/s" % (indent, v.gpointss))

        perf("Local performance indicators")
    else:
        indent = ""

    # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
    # thing that will be emitted
    for k, v in summary.items():
        rank = "[rank%d]" % k.rank if k.rank is not None else ""

        gflopss = "%.2f GFlops/s" % fround(v.gflopss)
        gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
        metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)

        shapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if len(shapes) > 1:
            name = "%s%s<%s>" % (k.name, rank,
                                 ",".join("<%s>" % i for i in shapes))
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]"
                 % (indent, name, fround(v.oi), fround(v.time), metrics))
        elif len(shapes) == 1:
            name = "%s%s<%s>" % (k.name, rank, shapes[0])
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]"
                 % (indent, name, fround(v.oi), fround(v.time), metrics))
        else:
            # No iteration shapes available: emit a terser line
            name = k.name
            perf("%s* %s%s computed in %.2f s"
                 % (indent, name, rank, fround(v.time)))

    perf("Configuration: %s" % self._state['optimizations'])

    return summary