예제 #1
0
class ECMData(PerformanceModel):
    """
    Representation of Data portion of the Execution-Cache-Memory Model.

    The cache predictor supplies load/store counts per memory hierarchy level; this class
    turns them into cycles per cache line of work for every level but the first.
    """

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=CacheSimulationPredictor,
                 verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        *parser* (optional) is stored as-is, it is not used by this class
        *cores* is the core count handed to the cache predictor
        *cache_predictor* is the predictor class to instantiate
        *verbose* is the verbosity level used by report()

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken into account,
        otherwise *args* takes precedence.

        Raises NotImplementedError if *args* selects a cache predictor other than
        'SIM' or 'LC'.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()  # only for verbose outputs
        })

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before, because it creates
        the 'cycles' list this method appends to.

        Returns the results dictionary. Raises NotImplementedError for a memory level
        that is not 'half-duplex' and ValueError for a cache level with an unknown
        duplexness.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = cacheline_size // element_size

        loads = self.predictor.get_loads()
        stores = self.predictor.get_stores()

        # the first hierarchy level is skipped, each remaining level is handled in order
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info[
                'non-overlap upstream throughput']

            if isinstance(throughput, str) and \
                    throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find best fitting kernel (closest to seen stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(read_streams + write_streams) * \
                        float(elements_per_cacheline) * float(element_size) * \
                        float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers."
                    )
                # add penalty cycles for each read stream; the penalty is defined per
                # *read* stream, so it scales with the load count (not the store count)
                if 'penalty cycles per read stream' in cache_info:
                    cycles += read_streams * \
                        cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw
                })
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] +
                              stores[cache_level]) / throughput
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / throughput,
                                 stores[cache_level] / throughput)
                else:
                    # 'level' is the identifier used for hierarchy entries everywhere else
                    raise ValueError(
                        "Duplexness of cache throughput may only be 'half-duplex' "
                        "or 'full-duplex', found {} in {}.".format(
                            duplexness, cache_info['level']))

            self.results['cycles'].append((cache_info['level'], cycles))

            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete analysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())

        return self.results

    def conv_cy(self, cy_cl):
        """
        Convert cycles (cy/CL) to other units, such as FLOP/s or It/s.

        *cy_cl* is a number or a PrefixedUnit in cy/CL.

        Returns a dictionary with the keys 'It/s', 'cy/CL', 'cy/It' and 'FLOP/s'.
        """
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(
            self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/S')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        # One cache line of work covers elements_per_cacheline iterations, so cycles per
        # iteration are cy/CL *divided* by that count. The division is written as a
        # multiplication by the reciprocal, the only scaling operation used on
        # PrefixedUnit objects in this class.
        cy_it = cy_cl * (1.0 / elements_per_cacheline)
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }

    def report_data_transfers(self):
        """Return a text table of loaded and stored bytes per cache line between levels."""
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel   | Loads    | Store    |\n"
        loads, stores = (self.predictor.get_loads(),
                         self.predictor.get_stores())
        hierarchy = self.machine['memory hierarchy']
        for cache_level, cache_info in list(enumerate(hierarchy))[1:]:
            # rows are labeled "<previous level>-<this level>"
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                hierarchy[cache_level - 1]['level'] + '-' + cache_info['level'],
                loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """
        Print generated model data in human readable format.

        The unit is taken from the command line arguments; if the model was created
        without *args*, cycles are reported in 'cy/CL'.
        """
        # without parsed arguments there is no unit selection, fall back to the native unit
        unit = self._args.unit if self._args else 'cy/CL'

        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])),
                  file=output_file)

        for level, cycles in self.results['cycles']:
            print('{} = {}'.format(level, self.conv_cy(cycles)[unit]),
                  file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                    self.results['memory bandwidth kernel'],
                    self.results['memory bandwidth']),
                      file=output_file)

            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
예제 #2
0
파일: ecm.py 프로젝트: RRZE-HPC/kerncraft
class ECMData(PerformanceModel):
    """
    Representation of Data portion of the Execution-Cache-Memory Model.

    The cache predictor supplies load/store counts per memory hierarchy level; this class
    turns them into cycles per cache line of work for every level but the first.
    """

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=CacheSimulationPredictor, verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        *parser* (optional) is stored as-is, it is not used by this class
        *cores* is the core count handed to the cache predictor
        *cache_predictor* is the predictor class to instantiate
        *verbose* is the verbosity level used by report()

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken into account,
        otherwise *args* takes precedence.

        Raises NotImplementedError if *args* selects a cache predictor other than
        'SIM' or 'LC'.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError("Unknown cache predictor, only LC (layer condition) and "
                                          "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()})  # only for verbose outputs

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before, because it creates
        the 'cycles' list this method appends to.

        Also stores the ratio of cache line size to kernel.bytes_per_iteration under
        'iterations per cacheline'.

        Returns the results dictionary. Raises NotImplementedError for a memory level
        that is not 'half-duplex' and ValueError for a cache level with an unknown
        duplexness.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = cacheline_size // element_size
        iterations_per_cacheline = (sympy.Integer(self.machine['cacheline size']) /
                                    sympy.Integer(self.kernel.bytes_per_iteration))
        self.results['iterations per cacheline'] = iterations_per_cacheline

        loads = self.predictor.get_loads()
        stores = self.predictor.get_stores()

        # the first hierarchy level is skipped, each remaining level is handled in order
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info['non-overlap upstream throughput']

            if isinstance(throughput, str) and throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find best fitting kernel (closest to seen stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(read_streams + write_streams) * \
                        float(elements_per_cacheline) * float(element_size) * \
                        float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers.")
                # add penalty cycles for each read stream; the penalty is defined per
                # *read* stream, so it scales with the load count (not the store count)
                if 'penalty cycles per read stream' in cache_info:
                    cycles += read_streams * cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw})
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] + stores[cache_level]) / throughput
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / throughput,
                                 stores[cache_level] / throughput)
                else:
                    # 'level' is the identifier used for hierarchy entries everywhere else
                    raise ValueError("Duplexness of cache throughput may only be 'half-duplex' "
                                     "or 'full-duplex', found {} in {}.".format(
                                         duplexness, cache_info['level']))

            self.results['cycles'].append((cache_info['level'], cycles))

            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete analysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())

        return self.results

    def conv_cy(self, cy_cl):
        """
        Convert cycles (cy/CL) to other units, such as FLOP/s or It/s.

        *cy_cl* is a number or a PrefixedUnit in cy/CL.

        Returns a dictionary with the keys 'It/s', 'cy/CL', 'cy/It' and 'FLOP/s'.
        """
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock/cy_cl*elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/S')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s*flops_per_it
        performance.unit = 'FLOP/s'
        # One cache line of work covers elements_per_cacheline iterations, so cycles per
        # iteration are cy/CL *divided* by that count. The division is written as a
        # multiplication by the reciprocal, the only scaling operation used on
        # PrefixedUnit objects in this class.
        cy_it = cy_cl*(1.0/elements_per_cacheline)
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report_data_transfers(self):
        """Return a text table of loaded and stored bytes per cache line between levels."""
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel   | Loads    | Store    |\n"
        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())
        hierarchy = self.machine['memory hierarchy']
        for cache_level, cache_info in list(enumerate(hierarchy))[1:]:
            # rows are labeled "<previous level>-<this level>"
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                hierarchy[cache_level-1]['level']+'-'+cache_info['level'],
                loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """
        Print generated model data in human readable format.

        The unit is taken from the command line arguments; if the model was created
        without *args*, cycles are reported in 'cy/CL'.
        """
        # without parsed arguments there is no unit selection, fall back to the native unit
        unit = self._args.unit if self._args else 'cy/CL'

        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])), file=output_file)

        for level, cycles in self.results['cycles']:
            print('{} = {}'.format(level, self.conv_cy(cycles)[unit]), file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                          self.results['memory bandwidth kernel'],
                          self.results['memory bandwidth']),
                      file=output_file)

            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
예제 #3
0
class Roofline(PerformanceModel):
    """
    Representation of the Roofline model based on simplistic FLOP analysis.

    For every memory hierarchy level the achievable performance is computed as
    arithmetic intensity (iterations per byte) times the measured bandwidth; the slowest
    level is recorded as the bottleneck and compared against the CPU peak in report().
    """

    name = "Roofline"

    @classmethod
    def configure_arggroup(cls, parser):
        """Configure argument parser (this model adds no arguments)."""
        pass

    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=LayerConditionPredictor,
                 verbose=0):
        """
        Create roofline model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        *parser* (optional) is stored as-is, it is not used by this class
        *cores* is the core count handed to the cache predictor and bandwidth lookup
        *cache_predictor* is the predictor class to instantiate
        *verbose* is the verbosity level used by report()

        If *args* is None, *cores*, *cache_predictor* and *verbose* will be used, otherwise
        *args* takes precedence.

        Raises NotImplementedError if *args* selects a cache predictor other than
        'SIM' or 'LC'.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """
        Apply cache prediction to generate cache access behaviour.

        Fills and returns self.results with the predictor's loads/stores, one entry per
        memory hierarchy level in 'mem bottlenecks', the index of the slowest level in
        'bottleneck level' and its converted performance in 'min performance'.
        """
        self.results = {
            'loads': self.predictor.get_loads(),
            'stores': self.predictor.get_stores(),
            'verbose infos':
            self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cache prediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = {
            item for sublist in read_offsets if sublist is not None
            for item in sublist
        }
        write_offsets = {
            item for sublist in write_offsets if sublist is not None
            for item in sublist
        }

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams,
            0,  # we do not consider stores to L1
            threads_per_core,
            cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = Iterations / bytes loaded
        if total_loads == 0:
            # This happens in case of full-caching. Infinite values are used (exactly as
            # for the other levels below), because None can neither be converted by
            # conv_perf() nor formatted by report().
            arith_intens = float('inf')
            it_s = PrefixedUnit(float('inf'), 'It/s')
        else:
            arith_intens = 1.0 / (total_loads / elements_per_cacheline)
            it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(it_s),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads
        })
        self.results['bottleneck level'] = len(
            self.results['mem bottlenecks']) - 1
        self.results['min performance'] = self.conv_perf(it_s)

        # for other cache and memory levels:
        for cache_level in range(len(self.machine['memory hierarchy']) - 1):
            next_level = cache_level + 1
            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['loads'][next_level]
            write_streams = self.results['stores'][next_level]

            # Compiling stats (in bytes!)
            total_loads = read_streams * cacheline_size
            total_stores = write_streams * cacheline_size

            # second, try to find best fitting kernel (closest to seen stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                next_level,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = iterations / bytes transferred)
            bytes_transfered = total_loads + total_stores

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                it_s = PrefixedUnit(float('inf'), 'It/s')
            else:
                arith_intens = 1 / (bytes_transfered / elements_per_cacheline)
                it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

            self.results['mem bottlenecks'].append({
                'performance': self.conv_perf(it_s),
                'level': self.machine['memory hierarchy'][next_level]['level'],
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered
            })
            # keep track of the slowest level seen so far
            if it_s < self.results['min performance']['It/s']:
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = self.conv_perf(it_s)

        return self.results

    def analyze(self):
        """
        Run analysis and return results.

        Adds the CPU peak performance as 'max_perf' to the results of
        calculate_cache_access().
        """
        precision = 'DP' if self.kernel.datatype == 'double' else 'SP'
        self.calculate_cache_access()

        # clock * cores * FLOPs per cycle is a FLOP/s figure, but conv_perf() expects It/s
        # (it multiplies by the FLOPs per iteration itself). Convert first, otherwise the
        # peak is scaled by the FLOPs per iteration twice.
        peak_flops = self.machine['clock'] * self.cores * \
            self.machine['FLOPs per cycle'][precision]['total']
        flops_per_it = sum(self.kernel._flops.values())
        if flops_per_it:
            peak_it_s = peak_flops * (1.0 / flops_per_it)
            peak_it_s.unit = 'It/s'
        else:
            # a kernel without FLOPs can not be limited by the FLOP peak
            peak_it_s = PrefixedUnit(float('inf'), 'It/s')
        self.results['max_perf'] = self.conv_perf(peak_it_s)

        return self.results

    def conv_perf(self, it_s):
        """
        Convert performance (It/s) to other units, such as FLOP/s or cy/CL.

        *it_s* is the performance in iterations per second.

        Returns a dictionary with the keys 'It/s', 'cy/CL', 'cy/It' and 'FLOP/s'.
        """
        clock = self.machine['clock']
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        # whole elements per cache line, same rounding as in calculate_cache_access()
        elements_per_cacheline = int(
            float(self.machine['cacheline size']) // element_size)
        cy_cl = clock / it_s * elements_per_cacheline
        cy_cl.unit = 'cy/CL'
        cy_it = clock / it_s
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }

    def report(self, output_file=sys.stdout):
        """
        Report analysis outcome in human readable form.

        The unit is taken from the command line arguments; if the model was created
        without *args*, 'FLOP/s' is used. Verbosity follows self.verbose, which equals
        args.verbose whenever *args* was given.
        """
        max_perf = self.results['max_perf']
        # without parsed arguments there is no unit selection, fall back to FLOP/s
        unit = self._args.unit if self._args else 'FLOP/s'

        if self.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self.verbose >= 1:
            print('{}'.format(pformat(self.results['verbose infos'])),
                  file=output_file)
            print('Bottlenecks:', file=output_file)
            print(
                '  level | a. intensity |   performance   |   peak bandwidth  | peak bandwidth kernel',
                file=output_file)
            print(
                '--------+--------------+-----------------+-------------------+----------------------',
                file=output_file)
            print('    CPU |              | {!s:>15} |                   |'.
                  format(max_perf[unit]),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                print(
                    '{level:>7} | {arithmetic intensity:>7.2} It/B | {0!s:>15} |'
                    ' {bandwidth!s:>17} | {bw kernel:<8}'.format(
                        b['performance'][unit], **b),
                    file=output_file)
            print('', file=output_file)

        if self.results['min performance']['FLOP/s'] > max_perf['FLOP/s']:
            # CPU bound: print the peak in the selected unit (not the whole unit dict)
            print('CPU bound. {!s} due to CPU max. FLOP/s'.format(max_perf[unit]),
                  file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound.', file=output_file)

            bottleneck = self.results['mem bottlenecks'][
                self.results['bottleneck level']]
            print(
                '{!s} due to {} transfer bottleneck (with bw from {} benchmark)'
                .format(bottleneck['performance'][unit],
                        bottleneck['level'], bottleneck['bw kernel']),
                file=output_file)
            print('Arithmetic Intensity: {:.2f} It/B'.format(
                bottleneck['arithmetic intensity']),
                  file=output_file)

        if any('_Complex' in var_info[0]
               for var_info in self.kernel.variables.values()):
            print(
                "WARNING: FLOP counts are probably wrong, because complex flops are counted\n"
                "         as single flops. All other units should not be affected.\n",
                file=sys.stderr)