class ECMData(PerformanceModel):
    """Representation of Data portion of the Execution-Cache-Memory Model."""

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=CacheSimulationPredictor, verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken
        into account, otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            # Parsed command-line arguments override the keyword parameters.
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()})  # only for verbose outputs

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = float(self.machine['cacheline size']) // element_size
        cacheline_size = float(self.machine['cacheline size'])

        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())

        # Index 0 is L1; transfers *into* level i are indexed by i, so start at 1.
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info['non-overlap upstream throughput']
            # isinstance() is the idiomatic type check (was: type(...) is str)
            if isinstance(throughput, str) and throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find best fitting kernel (closest to stream seen stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(loads[cache_level] + stores[cache_level]) * \
                        float(elements_per_cacheline) * float(element_size) * \
                        float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers.")
                # add penalty cycles for each read stream
                # NOTE(review): this multiplies by stores[...] although the comment and
                # the machine-file key say "per read stream" -- verify whether
                # loads[...] was intended.
                if 'penalty cycles per read stream' in cache_info:
                    cycles += stores[cache_level] * \
                        cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw})
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] + stores[cache_level]) / float(throughput)
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / float(throughput),
                                 stores[cache_level] / float(throughput))
                else:
                    # BUGFIX: original message lacked a space between the two string
                    # parts, rendering as "half-duplex'or 'full-duplex'".
                    raise ValueError(
                        "Duplexness of cache throughput may only be 'half-duplex' "
                        "or 'full-duplex', found {} in {}.".format(
                            duplexness, cache_info['name']))

            self.results['cycles'].append((cache_info['level'], cycles))
            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete analysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())
        return self.results

    def conv_cy(self, cy_cl):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')
        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            # zero cycles means infinitely fast
            it_s = PrefixedUnit('inf It/S')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl * elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report_data_transfers(self):
        """Return a table of per-level load/store volumes in bytes per cacheline."""
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel | Loads | Store |\n"
        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                self.machine['memory hierarchy'][cache_level - 1]['level'] + '-' +
                cache_info['level'],
                loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])),
                  file=output_file)

        for level, cycles in self.results['cycles']:
            # NOTE(review): dereferences self._args.unit, so reporting assumes the
            # model was built from parsed command-line args -- confirm callers.
            print('{} = {}'.format(level, self.conv_cy(cycles)[self._args.unit]),
                  file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                    self.results['memory bandwidth kernel'],
                    self.results['memory bandwidth']),
                    file=output_file)

        if self.verbose > 1:
            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
class ECMData(PerformanceModel):
    """Representation of Data portion of the Execution-Cache-Memory Model."""

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=CacheSimulationPredictor, verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken
        into account, otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            # Parsed command-line arguments override the keyword parameters.
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(self.kernel, self.machine,
                                                          self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(self.kernel, self.machine,
                                                         self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()})  # only for verbose outputs

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = float(self.machine['cacheline size']) // element_size
        # exact rational iterations/CL, kept symbolic via sympy
        iterations_per_cacheline = (sympy.Integer(self.machine['cacheline size']) /
                                    sympy.Integer(self.kernel.bytes_per_iteration))
        self.results['iterations per cacheline'] = iterations_per_cacheline
        cacheline_size = float(self.machine['cacheline size'])

        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())

        # Index 0 is L1; transfers *into* level i are indexed by i, so start at 1.
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info['non-overlap upstream throughput']
            # isinstance() is the idiomatic type check (was: type(...) is str)
            if isinstance(throughput, str) and throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find best fitting kernel (closest to stream seen stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(loads[cache_level] + stores[cache_level]) * \
                        float(elements_per_cacheline) * float(element_size) * \
                        float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers.")
                # add penalty cycles for each read stream
                # NOTE(review): this multiplies by stores[...] although the comment and
                # the machine-file key say "per read stream" -- verify whether
                # loads[...] was intended.
                if 'penalty cycles per read stream' in cache_info:
                    cycles += stores[cache_level] * \
                        cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw})
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] + stores[cache_level]) / float(throughput)
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / float(throughput),
                                 stores[cache_level] / float(throughput))
                else:
                    # BUGFIX: original message lacked a space between the two string
                    # parts, rendering as "half-duplex'or 'full-duplex'".
                    raise ValueError(
                        "Duplexness of cache throughput may only be 'half-duplex' "
                        "or 'full-duplex', found {} in {}.".format(
                            duplexness, cache_info['name']))

            self.results['cycles'].append((cache_info['level'], cycles))
            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete analysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())
        return self.results

    def conv_cy(self, cy_cl):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')
        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            # zero cycles means infinitely fast
            it_s = PrefixedUnit('inf It/S')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl * elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report_data_transfers(self):
        """Return a table of per-level load/store volumes in bytes per cacheline."""
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel | Loads | Store |\n"
        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                self.machine['memory hierarchy'][cache_level - 1]['level'] + '-' +
                cache_info['level'],
                loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])),
                  file=output_file)

        for level, cycles in self.results['cycles']:
            # NOTE(review): dereferences self._args.unit, so reporting assumes the
            # model was built from parsed command-line args -- confirm callers.
            print('{} = {}'.format(level, self.conv_cy(cycles)[self._args.unit]),
                  file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                    self.results['memory bandwidth kernel'],
                    self.results['memory bandwidth']),
                    file=output_file)

        if self.verbose > 1:
            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
class Roofline(PerformanceModel):
    """
    Representation of the Roofline model based on simplistic FLOP analysis.

    more info to follow...
    """

    name = "Roofline"

    @classmethod
    def configure_arggroup(cls, parser):
        """Configure argument parser."""
        pass

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=LayerConditionPredictor, verbose=0):
        """
        Create roofline model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *asm_block*, *pointer_increment* and *verbose* will be
        used, otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            # Parsed command-line arguments override the keyword parameters.
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Apply cache prediction to generate cache access behaviour."""
        self.results = {'loads': self.predictor.get_loads(),
                        'stores': self.predictor.get_stores(),
                        'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
                        'bottleneck level': 0,
                        'mem bottlenecks': []}

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(self.kernel.compile_global_offsets(
            iteration=range(0, elements_per_cacheline))))
        # flatten, skipping iterations without accesses (set comprehension replaces
        # the original set([...]) call)
        read_offsets = {item for sublist in read_offsets if sublist is not None
                        for item in sublist}
        write_offsets = {item for sublist in write_offsets if sublist is not None
                         for item in sublist}

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0, read_streams, 0,  # we do not consider stores to L1
            threads_per_core, cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = Iterations / bytes loaded
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            it_s = None
        else:
            arith_intens = 1.0 / (total_loads / elements_per_cacheline)
            it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

        # NOTE(review): if total_loads == 0, it_s is None and conv_perf(None)
        # will fail on multiplication -- confirm full-caching cannot occur at L1,
        # or guard here.
        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(it_s),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads})
        self.results['bottleneck level'] = len(self.results['mem bottlenecks']) - 1
        self.results['min performance'] = self.conv_perf(it_s)

        # for other cache and memory levels:
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_loads = self.results['loads'][cache_level + 1] * cacheline_size
            total_stores = self.results['stores'][cache_level + 1] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['loads'][cache_level + 1]
            write_streams = self.results['stores'][cache_level + 1]
            # second, try to find best fitting kernel (closest to stream seen stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1, read_streams, write_streams, threads_per_core,
                cores=self.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = flops / bytes transfered)
            bytes_transfered = total_loads + total_stores

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                it_s = PrefixedUnit(float('inf'), 'It/s')
            else:
                arith_intens = 1 / (bytes_transfered / elements_per_cacheline)
                it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

            self.results['mem bottlenecks'].append({
                'performance': self.conv_perf(it_s),
                'level': (self.machine['memory hierarchy'][cache_level + 1]['level']),
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered})
            # keep track of the slowest (bottleneck) level seen so far
            if it_s < self.results.get('min performance', {'It/s': it_s})['It/s']:
                self.results['bottleneck level'] = len(self.results['mem bottlenecks']) - 1
                self.results['min performance'] = self.conv_perf(it_s)

        return self.results

    def analyze(self):
        """Run analysis."""
        precision = 'DP' if self.kernel.datatype == 'double' else 'SP'
        self.calculate_cache_access()
        self.results['max_perf'] = self.conv_perf(
            self.machine['clock'] * self.cores *
            self.machine['FLOPs per cycle'][precision]['total'])

    def conv_perf(self, it_s):
        """Convert performance (It/s) to other units, such as FLOP/s or cy/CL."""
        clock = self.machine['clock']
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        # NOTE(review): true division here, while the ECM model uses floor
        # division for elements per cacheline -- confirm the float result is
        # intended.
        elements_per_cacheline = int(float(self.machine['cacheline size'])) / element_size
        cy_cl = clock / it_s * elements_per_cacheline
        cy_cl.unit = 'cy/CL'
        cy_it = clock / it_s
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report(self, output_file=sys.stdout):
        """Report analysis outcome in human readable form."""
        max_perf = self.results['max_perf']

        if self._args and self._args.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self._args and self._args.verbose >= 1:
            print('{}'.format(pformat(self.results['verbose infos'])), file=output_file)
            print('Bottlenecks:', file=output_file)
            print(' level | a. intensity | performance | peak bandwidth | peak bandwidth kernel',
                  file=output_file)
            print('--------+--------------+-----------------+-------------------+----------------------',
                  file=output_file)
            print(' CPU | | {!s:>15} | |'.format(max_perf[self._args.unit]),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                print('{level:>7} | {arithmetic intensity:>7.2} It/B | {0!s:>15} |'
                      ' {bandwidth!s:>17} | {bw kernel:<8}'.format(
                          b['performance'][self._args.unit], **b),
                      file=output_file)
            print('', file=output_file)

        if self.results['min performance']['FLOP/s'] > max_perf['FLOP/s']:
            # CPU bound
            print('CPU bound. {!s} due to CPU max. FLOP/s'.format(max_perf),
                  file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound.', file=output_file)
            bottleneck = self.results['mem bottlenecks'][self.results['bottleneck level']]
            print('{!s} due to {} transfer bottleneck (with bw from {} benchmark)'.format(
                bottleneck['performance'][self._args.unit],
                bottleneck['level'],
                bottleneck['bw kernel']),
                file=output_file)
            print('Arithmetic Intensity: {:.2f} It/B'.format(
                bottleneck['arithmetic intensity']), file=output_file)

        # any() with a generator replaces the original any([...]) list
        if any('_Complex' in var_info[0] for var_info in self.kernel.variables.values()):
            print("WARNING: FLOP counts are probably wrong, because complex flops are counted\n"
                  "         as single flops. All other units should not be affected.\n",
                  file=sys.stderr)