Exemplo n.º 1
0
        def runPWLoopModel(linfo, my_params):
            """Model one loop from its precomputed per-block WS/BW figures.

            Reads 'linenum', 'sweeps', 'ranges', 'registers', 'WS', 'BW' and
            'flops' from ``linfo`` and the machine/problem settings from
            ``my_params``.  Returns a dict with the keys 'linenum', 'range',
            'block', 'numBlocks', 'GPRegs', 'FPRegs', 'regAlloc', 'WSFinal',
            'BWFinal', 'adds', 'multiplies', 'divides', 'specials', 'flops',
            'wflops', 'BF', 'cputime', 'ramtime' and 'time'.
            """
            result = {}
            result['linenum'] = linfo['linenum']

            numSweeps = doParamSubs(linfo['sweeps'], my_params)

            # iteration count along each of the three loop ranges
            result['range'] = tuple(
                [doParamSubs(numIters(linfo['ranges'][dim]), my_params)
                 for dim in (0, 1, 2)])

            # block extent = block size + (loop iterations - problem size)
            extents = []
            for dim, axis in enumerate(('X', 'Y', 'Z')):
                blockSize = my_params[axis + ' Block Size']
                surplus = result['range'][dim] - my_params[axis + ' Problem Size']
                extents.append(blockSize + surplus)
            result['block'] = extents

            # NOTE(review): plain '/' -- truncates under Python 2 when all
            # sizes are ints; confirm that is intended.
            problemCells = my_params['X Problem Size'] * \
                my_params['Y Problem Size'] * my_params['Z Problem Size']
            blockCells = my_params['X Block Size'] * \
                my_params['Y Block Size'] * my_params['Z Block Size']
            numBlocks = problemCells / blockCells
            result['numBlocks'] = numBlocks

            # register counts
            intRegs = linfo['registers']['ints'] + linfo['registers']['ptrs']
            result['GPRegs'] = doParamSubs(intRegs, my_params)
            result['FPRegs'] = doParamSubs(linfo['registers']['floats'],
                                           my_params)
            result['regAlloc'] = regAllocModel(linfo, my_params)

            # working set, and memory traffic if the working set fits
            wsExpr = my_params['Word Size'] * linfo['WS']['sizeBlocks']
            result['WSFinal'] = doParamSubs(wsExpr, my_params)
            cacheLimit = my_params['$/thread group (kB)'] * 2**10
            if result['WSFinal'] <= cacheLimit:
                blockSizes = linfo['BW']['sizeBlocks']
                trafficExpr = my_params['R cost'] * blockSizes['R'] + \
                    my_params['W cost'] * blockSizes['W'] + \
                    my_params['RW cost'] * blockSizes['RW']
                result['BWFinal'] = numBlocks * doParamSubs(trafficExpr,
                                                            my_params)
            else:
                # working set exceeds the cache limit: give up on this method
                result['BWFinal'] = float('inf')

            # flop counts per category, then plain and weighted totals
            numCellIters = numSweeps * numBlocks * prod(result['block'])
            for kind in ('adds', 'multiplies', 'divides', 'specials'):
                result[kind] = numCellIters * doParamSubs(
                    linfo['flops'][kind], my_params)
            result['flops'] = result['adds'] + result['multiplies'] + \
                result['divides'] + result['specials']
            result['wflops'] = result['adds'] + result['multiplies'] + \
                my_params['Division Cost'] * result['divides'] + \
                my_params['Special Cost'] * result['specials']

            # traffic per weighted flop (NaN when there are no flops)
            if result['wflops'] != 0:
                result['BF'] = float(result['BWFinal']) / result['wflops']
            else:
                result['BF'] = float('nan')

            # execution time
            threads = my_params['Threads']
            cpuSeconds = float(result['wflops']) / \
                (my_params['Gflop/s/thread'] * threads * 10**9)
            ramSeconds = float(result['BWFinal']) / \
                (my_params['GB/s/thread'] * threads * 2**30)
            # perfect overlap: only the larger of the two times is kept
            if cpuSeconds > ramSeconds:
                ramSeconds = 0
            else:
                cpuSeconds = 0
            result['cputime'] = cpuSeconds
            result['ramtime'] = ramSeconds
            result['time'] = max(cpuSeconds, ramSeconds)

            return result
Exemplo n.º 2
0
 def iter_n(self):
     """Return numIters(self.range) divided by self.stride.

     Uses the plain '/' operator, so the result type follows the operands.
     """
     total = numIters(self.range)
     return total / self.stride
Exemplo n.º 3
0
        def runLoopModel(linfo, my_params):
            """Estimate working set, memory traffic, flops and time for one blocked loop.

            Args:
                linfo: loop description dict.  Keys read here: 'linenum',
                    'ranges' (three entries), 'registers', 'arrays', 'WS',
                    'BW', 'sweeps' and 'flops'.
                my_params: parameter dict; keys read here include the block
                    sizes, 'Cache Line Size', 'Word Size', the R/W/RW costs,
                    'NTA Hints', 'Streaming Writes', '$/thread group (kB)',
                    'Division Cost', 'Special Cost', 'Gflop/s/thread',
                    'GB/s/thread' and 'Threads'.

            Returns:
                dict with the keys 'linenum', 'range', 'block', 'numBlocks',
                'GPRegs', 'FPRegs', 'regAlloc', 'arrays', 'WS', 'BW',
                'WSFinal', 'BWFinal', 'adds', 'multiplies', 'divides',
                'specials', 'flops', 'wflops', 'BF', 'cputime', 'ramtime'
                and 'time'.
            """
            result = {}

            result['linenum'] = linfo['linenum']
            result['range'] = (doParamSubs(numIters(linfo['ranges'][0]), my_params), \
                               doParamSubs(numIters(linfo['ranges'][1]), my_params), \
                               doParamSubs(numIters(linfo['ranges'][2]), my_params))
            blockx = my_params['X Block Size']
            blocky = my_params['Y Block Size']
            blockz = my_params['Z Block Size']
            # result['block'] keeps the unrounded X block size; only the local
            # blockx is rounded to a cache-line multiple further down
            result['block'] = [blockx, blocky, blockz]
            # float() forces true division, so numBlocks may be fractional
            result['numBlocks'] = float(prod(result['range'])) / prod(
                result['block'])

            # round blockx up to nearest cache line multiple
            # (CLWords = cache line size divided by word size)
            CLWords = my_params['Cache Line Size'] / my_params['Word Size']
            blockx = math.ceil(float(blockx) / CLWords) * CLWords

            # registers
            result['GPRegs'] = doParamSubs(
                linfo['registers']['ints'] + linfo['registers']['ptrs'],
                my_params)
            result['FPRegs'] = doParamSubs(linfo['registers']['floats'],
                                           my_params)
            result['regAlloc'] = regAllocModel(linfo, my_params)

            # Compute working sets and memory traffic for read-only arrays
            # (the arrays selected by analyze.getR)
            arrays = []
            for ainfo in analyze.getR(linfo['arrays']):
                array = {}
                array['name'] = ainfo['name']
                # per-dimension access extent: unrounded block size plus
                # diff(ghost)[0].
                # NOTE(review): the indexing below needs map() to return a
                # list, as it does in Python 2.
                array['access'] = map(lambda x, y: x + diff(y)[0],
                                      result['block'], ainfo['ghost'])
                accessx = array['access'][0]
                accessy = array['access'][1]
                accessz = array['access'][2]

                # round accessx up to nearest cache line multiple
                accessx = math.ceil(float(accessx) / CLWords) * CLWords

                # WS calculation for generic case
                # ('all' uses the num* counts, 'reuse' the numReuse* counts)
                array['WS'] = {'all'  : {'plane' : my_params['Word Size'] * ainfo['WS']['numPlanes'] * \
                                                   accessx * accessy, \
                                         'pencil': my_params['Word Size'] * ainfo['WS']['numPencils'] * \
                                                   accessx, \
                                         'cell'  : my_params['Word Size'] * ainfo['WS']['numCells'], \
                                        }, \
                               'reuse': {'plane' : my_params['Word Size'] * ainfo['WS']['numReusePlanes'] * \
                                                   accessx * accessy, \
                                         'pencil': my_params['Word Size'] * ainfo['WS']['numReusePencils'] * \
                                                   accessx, \
                                         'cell'  : my_params['Word Size'] * ainfo['WS']['numReuseCells'], \
                                        }, \
                              }
                # fix the plane WS for faces-only stencils
                # (the pencil entries are replaced as well; 'cell' is kept)
                if ainfo['stenciltype'] == 'faces':

                    array['WS']['all']['plane'] = my_params['Word Size'] * \
                        ((ainfo['WS']['numPlanes'] - 1) * (blockx * blocky) + \
                         1 * (blockx * accessy + accessx * blocky - blockx * blocky))
                    array['WS']['all']['pencil'] = my_params['Word Size'] * \
                        ((ainfo['WS']['numPencils'] - 1) * blockx + 1 * accessx)

                    array['WS']['reuse']['plane'] = my_params['Word Size'] * \
                        ((ainfo['WS']['numReusePlanes'] - 1) * (blockx * blocky) + \
                         1 * (blockx * accessy + accessx * blocky - blockx * blocky))
                    array['WS']['reuse']['pencil'] = my_params['Word Size'] * \
                        ((ainfo['WS']['numReusePencils'] - 1) * blockx + 1 * accessx)

                # BW calculation for generic case
                # (one entry per reuse level: block, plane, pencil, cell)
                array['BW'] = {'block' : result['numBlocks'] * ainfo['BW']['numCopies'] * my_params['R cost'] * \
                                         accessx * accessy * accessz, \
                               'plane' : result['numBlocks'] * ainfo['BW']['numPlanes'] * my_params['R cost'] * \
                                         accessx * accessy * blockz, \
                               'pencil': result['numBlocks'] * ainfo['BW']['numPencils'] * my_params['R cost'] * \
                                         accessx * blocky * blockz, \
                               'cell'  : result['numBlocks'] * ainfo['BW']['numCells'] * my_params['R cost'] * \
                                         blockx * blocky * blockz, \
                              }
                # fix the block and plane BW for faces-only stencils
                # (the pencil entry is replaced as well; 'cell' is kept)
                if ainfo['stenciltype'] == 'faces':
                    array['BW']['block'] = result['numBlocks'] * ainfo['BW']['numCopies'] * my_params['R cost'] * \
                        (accessx * blocky * blockz + blockx * accessy * blockz + \
                         blockx * blocky * accessz - 2 * blockx * blocky * blockz)
                    array['BW']['plane'] = result['numBlocks'] * my_params['R cost'] * \
                        ((ainfo['BW']['numPlanes'] - 1) * (blockx * blocky) + \
                         1 * (accessx * blocky + blockx * accessy - blockx * blocky)) * blockz
                    array['BW']['pencil'] = result['numBlocks'] * my_params['R cost'] * \
                        ((ainfo['BW']['numPencils'] - 1) * blockx + 1 * accessx) * blocky * blockz

                arrays.append(array)

            # totals over all read-only arrays
            sumWSAllPlane = sum(map(lambda x: x['WS']['all']['plane'], arrays))
            sumWSAllPencil = sum(
                map(lambda x: x['WS']['all']['pencil'], arrays))
            sumWSAllCell = sum(map(lambda x: x['WS']['all']['cell'], arrays))
            sumWSReusePlane = sum(
                map(lambda x: x['WS']['reuse']['plane'], arrays))
            sumWSReusePencil = sum(
                map(lambda x: x['WS']['reuse']['pencil'], arrays))
            sumWSReuseCell = sum(
                map(lambda x: x['WS']['reuse']['cell'], arrays))

            sumBWBlock = sum(map(lambda x: x['BW']['block'], arrays))
            sumBWPlane = sum(map(lambda x: x['BW']['plane'], arrays))
            sumBWPencil = sum(map(lambda x: x['BW']['pencil'], arrays))
            sumBWCell = sum(map(lambda x: x['BW']['cell'], arrays))

            result['arrays'] = arrays

            # Compute working sets for different scenarios
            # 'all' adds the RW and W array terms to the read-only totals;
            # 'stream' and 'reuse' use the read-only totals alone
            result['WS'] = {'all'   : {'plane' : sumWSAllPlane + my_params['Word Size'] * \
                                                 (linfo['WS']['numPlanes']['RW'] + linfo['WS']['numPlanes']['W']) * \
                                                 blockx * blocky, \
                                       'pencil': sumWSAllPencil + my_params['Word Size'] * \
                                                 (linfo['WS']['numPencils']['RW'] + linfo['WS']['numPencils']['W']) * \
                                                 blockx, \
                                       'cell'  : sumWSAllCell + my_params['Word Size'] * \
                                                 (linfo['WS']['numCells']['RW'] + linfo['WS']['numCells']['W']), \
                                      }, \
                            'stream': {'plane' : sumWSAllPlane, \
                                       'pencil': sumWSAllPencil, \
                                       'cell'  : sumWSAllCell, \
                                      }, \
                            'reuse' : {'plane' : sumWSReusePlane, \
                                       'pencil': sumWSReusePencil, \
                                       'cell'  : sumWSReuseCell, \
                                      }, \
                           }

            # Determine actual working sets based on cache utilization policy
            # NOTE(review): 'actual' is bound to the same dict object as the
            # selected scenario, not a copy, so the in-place adjustments to
            # result['WS']['actual'] further down also change that scenario's
            # entries -- confirm this is intended.
            if my_params['NTA Hints']:
                result['WS']['actual'] = result['WS']['reuse']
            elif my_params['Streaming Writes']:
                result['WS']['actual'] = result['WS']['stream']
            else:
                result['WS']['actual'] = result['WS']['all']

            # Compute memory traffic for different reuse scenarios
            # (RWWBW is the RW/W array traffic, added at every reuse level)
            numSweeps = doParamSubs(linfo['sweeps'], my_params)
            RWWBW = (linfo['BW']['numArrays']['RW'] * my_params['RW cost'] + \
                     linfo['BW']['numArrays']['W' ] * my_params['W cost' ]) * result['numBlocks'] * blockx * blocky * blockz
            result['BW'] = {'block' : numSweeps * (sumBWBlock  + RWWBW), \
                            'plane' : numSweeps * (sumBWPlane  + RWWBW), \
                            'pencil': numSweeps * (sumBWPencil + RWWBW), \
                            'cell'  : numSweeps * (sumBWCell   + RWWBW), \
                           }

            # do symbolic parameter substitutions
            # (the dict aliased by 'actual' is reached twice by this loop;
            # iteritems() is Python 2 only)
            for x in result['WS'].values():
                for (y, z) in x.iteritems():
                    x[y] = doParamSubs(z, my_params)
            for (y, z) in result['BW'].iteritems():
                result['BW'][y] = doParamSubs(z, my_params)

            # bandwidth should be no worse than model prediction for cases with worse reuse,
            #   but model approximations cause different inaccuracies for different cases
            # (clamped in order so that block <= plane <= pencil <= cell)
            result['BW']['pencil'] = min(result['BW']['pencil'],
                                         result['BW']['cell'])
            result['BW']['plane'] = min(result['BW']['plane'],
                                        result['BW']['pencil'])
            result['BW']['block'] = min(result['BW']['block'],
                                        result['BW']['plane'])

            # if there's no difference between memory traffic, working set is effectively reduced
            if result['BW']['pencil'] == result['BW']['cell']:
                result['WS']['actual']['cell'] = 0
            if result['BW']['plane'] == result['BW']['pencil']:
                result['WS']['actual']['pencil'] = result['WS']['actual'][
                    'cell']
            if result['BW']['block'] == result['BW']['plane']:
                result['WS']['actual']['plane'] = result['WS']['actual'][
                    'pencil']

            # Compute "actual" memory traffic based on type of reuse given available cache
            # (the limit is the '$/thread group (kB)' parameter times 2**10;
            # the first working-set level that fits selects the traffic figure)
            if result['WS']['actual'][
                    'plane'] <= my_params['$/thread group (kB)'] * 2**10:
                result['BW']['actual'] = result['BW']['block']
            elif result['WS']['actual'][
                    'pencil'] <= my_params['$/thread group (kB)'] * 2**10:
                result['BW']['actual'] = result['BW']['plane']
            elif result['WS']['actual'][
                    'cell'] <= my_params['$/thread group (kB)'] * 2**10:
                result['BW']['actual'] = result['BW']['pencil']
            else:
                result['BW']['actual'] = result['BW']['cell']

            # save final WS and BW
            # (WSFinal is always the plane-level entry, whichever level was
            # selected above)
            result['WSFinal'] = result['WS']['actual']['plane']
            result['BWFinal'] = result['BW']['actual']

            # number of flops and weighted flops
            # (per-iteration counts scaled by sweeps and total iterations)
            result['adds'] = numSweeps * prod(result['range']) * doParamSubs(
                linfo['flops']['adds'], my_params)
            result['multiplies'] = numSweeps * prod(
                result['range']) * doParamSubs(linfo['flops']['multiplies'],
                                               my_params)
            result['divides'] = numSweeps * prod(
                result['range']) * doParamSubs(linfo['flops']['divides'],
                                               my_params)
            result['specials'] = numSweeps * prod(
                result['range']) * doParamSubs(linfo['flops']['specials'],
                                               my_params)
            result['flops'] = result['adds'] + result['multiplies'] + result[
                'divides'] + result['specials']
            result['wflops'] = result['adds'] + result['multiplies'] + my_params['Division Cost'] * result['divides'] + \
                                                                       my_params['Special Cost'] * result['specials']

            # arithmetic intensity
            # (stored as traffic per weighted flop; NaN when wflops is zero)
            if result['wflops'] != 0:
                result['BF'] = float(result['BWFinal']) / result['wflops']
            else:
                result['BF'] = float('nan')

            # execution time
            result['cputime'] = float(result['wflops']) / \
                                (my_params['Gflop/s/thread'] * my_params['Threads'] * 10**9)
            result['ramtime'] = float(result['BW']['actual']) / \
                                (my_params['GB/s/thread'] * my_params['Threads'] * 2**30)
            # assume perfect overlap
            # (the smaller of the two times is zeroed, so 'time' is the larger)
            if result['cputime'] > result['ramtime']:
                result['ramtime'] = 0
            else:
                result['cputime'] = 0
            result['time'] = max(result['cputime'], result['ramtime'])

            return result
Exemplo n.º 4
0
            def mergeLoops(linfo1, linfo2):
                """Fold the counts of child loop ``linfo2`` into parent loop ``linfo1``.

                Flop counts, scalar read/write counts and array access counts
                of ``linfo2`` are scaled by ``numIters(linfo2['range'])`` and
                added to the matching entries of ``linfo1``; entries that
                ``linfo1`` does not have yet are appended.  Both dicts are
                modified in place and nothing is returned.

                Raises:
                    Exception: if ``linfo2`` has nested loops of its own, or
                        if the copies-mismatch merge meets an access table
                        whose keys are not exactly ``[(0, )]``.
                """

                def mergeAccess(array1, array2):
                    """Add array2's per-index read/write counts into array1's table.

                    Returns array1's (mutated) access table.  Records adopted
                    from array2 are copied so that no two parent indices, and
                    no parent/child pair, share one mutable record; a shared
                    record would be incremented once per alias by the ``+=``
                    updates of a later merge.
                    """
                    access1 = array1['access']
                    access2 = array2['access']
                    # HACK: if there's a numCopies mismatch, try to recognize if there's an implicit
                    # knowledge of the number of elements in the array
                    if array1['copies'] == 1 and array2['copies'] != 1:
                        maxIndex = max(map(lambda x: x[0], access1.keys()))
                        print >> sys.stderr, 'Warning: During merge of %s: assuming %s equals %s' % \
                                             (array2['name'], array2['copies'], maxIndex)
                        if access2.keys() != [(0, )]:
                            raise Exception('unsupported access merge')
                        # spread the child's single record over indices 1..maxIndex
                        childRecord = access2[(0, )]
                        for idx in xrange(1, maxIndex + 1):
                            if (idx, ) in access1:
                                access1[(idx, )]['reads'] += childRecord['reads']
                                access1[(idx, )]['writes'] += childRecord['writes']
                            else:
                                # independent copy per index (see docstring)
                                access1[(idx, )] = dict(childRecord)
                    else:
                        for index in access2:
                            if index in access1:
                                access1[index]['reads'] += access2[index][
                                    'reads']
                                access1[index]['writes'] += access2[index][
                                    'writes']
                            else:
                                # copy so parent and child do not share the record
                                access1[index] = dict(access2[index])
                    return access1

                iters = numIters(linfo2['range'])
                if linfo2['loops']:
                    raise Exception(
                        'child loop cannot have nested loops of its own')

                # process flops
                for floptype in ['adds', 'multiplies', 'divides', 'specials']:
                    linfo1['flops'][floptype] = linfo1['flops'][
                        floptype] + iters * linfo2['flops'][floptype]

                # process scalars (the child's counts are scaled in place first)
                for scalar2 in linfo2['scalars']:
                    scalar2['reads'] *= iters
                    scalar2['writes'] *= iters
                    scalar1 = getMember(linfo1['scalars'], scalar2['name'])
                    if scalar1:
                        scalar1['reads'] += scalar2['reads']
                        scalar1['writes'] += scalar2['writes']
                    else:
                        linfo1['scalars'].append(scalar2)

                # process spatial arrays and state arrays
                for category in ['arrays', 'stateArrays']:
                    for array2 in linfo2[category]:
                        # species arrays and 'relStateArray' state arrays scale
                        # their number of copies; every other array scales its
                        # access counts instead
                        if (category == 'arrays' and isSpeciesArray(array2['name'])) or \
                           (category == 'stateArrays' and array2['arraytype'] == 'relStateArray'):
                            array2['copies'] = iters * array2['copies']
                        else:
                            for access in array2['access'].values():
                                access['reads'] *= iters
                                access['writes'] *= iters
                        array1 = getMember(linfo1[category], array2['name'])
                        if array1:
                            array1['access'] = mergeAccess(array1, array2)
                        else:
                            linfo1[category].append(array2)
Exemplo n.º 5
0
        def flattenLoopNest(linfo):
            """Collapse a recognised spatial loop nest into its innermost loop record.

            Three shapes are recognised from the loop ranges: a triply nested
            spatial loop, a quadruple nest whose outer loop wraps three
            spatial loops, and a quadruple nest whose three outer loops are
            spatial.  The loop dicts are modified in place.

            Args:
                linfo: outermost loop dict with the keys 'loopvar', 'range',
                    'stride', 'loops' and 'linenum'.

            Returns:
                The innermost spatial loop dict.  Its 'loopvar', 'range' and
                'stride' keys are replaced by the lists 'loopvars', 'ranges'
                and 'strides' (innermost loop first); 'sweeps' and the line
                number of the outermost spatial loop are added.  Returns None
                when the nest matches none of the three shapes.
            """

            def getRanges(linfo):
                """Collect [loopvars, ranges, strides] down the first-child chain."""
                result = [[linfo['loopvar']], [linfo['range']],
                          [linfo['stride']]]
                if len(linfo['loops']) > 0:
                    temp = getRanges(linfo['loops'][0])
                    result[0].extend(temp[0])
                    result[1].extend(temp[1])
                    result[2].extend(temp[2])
                return result

            def isSpatial(x):
                """Truthy when the bounds of range x look like lo(d) / hi(d)."""
                # raw strings: '\(' and '\d' are not valid escapes in a
                # plain string literal
                return re.search(r'd?lo\(\d\)', x[0]) and \
                       re.search(r'd?hi\(\d\)', x[1])

            # triply nested spatial loop
            def isType0(ranges):
                return len(ranges) == 3 and all(map(isSpatial, ranges))

            # quadruply nested loop with spatial loops 2, 3, and 4
            def isType1(ranges):
                return len(ranges) == 4 and all(map(isSpatial, ranges[1:]))

            # quadruply nested loop with spatial loops 1, 2, and 3
            def isType2(ranges):
                return len(ranges) == 4 and all(map(isSpatial, ranges[:-1]))

            # merge info from child loop linfo2 into parent loop linfo1
            def mergeLoops(linfo1, linfo2):
                """Scale linfo2's counts by its iteration count and add them to linfo1.

                Both dicts are modified in place; nothing is returned.  Raises
                Exception when linfo2 has nested loops of its own or when the
                copies-mismatch merge is not supported.
                """

                def mergeAccess(array1, array2):
                    """Add array2's per-index read/write counts into array1's table.

                    Returns array1's (mutated) access table.  Records adopted
                    from array2 are copied so that no two parent indices, and
                    no parent/child pair, share one mutable record; a shared
                    record would be incremented once per alias by the ``+=``
                    updates of a later merge.
                    """
                    access1 = array1['access']
                    access2 = array2['access']
                    # HACK: if there's a numCopies mismatch, try to recognize if there's an implicit
                    # knowledge of the number of elements in the array
                    if array1['copies'] == 1 and array2['copies'] != 1:
                        maxIndex = max(map(lambda x: x[0], access1.keys()))
                        print >> sys.stderr, 'Warning: During merge of %s: assuming %s equals %s' % \
                                             (array2['name'], array2['copies'], maxIndex)
                        if access2.keys() != [(0, )]:
                            raise Exception('unsupported access merge')
                        # spread the child's single record over indices 1..maxIndex
                        childRecord = access2[(0, )]
                        for idx in xrange(1, maxIndex + 1):
                            if (idx, ) in access1:
                                access1[(idx, )]['reads'] += childRecord['reads']
                                access1[(idx, )]['writes'] += childRecord['writes']
                            else:
                                # independent copy per index (see docstring)
                                access1[(idx, )] = dict(childRecord)
                    else:
                        for index in access2:
                            if index in access1:
                                access1[index]['reads'] += access2[index][
                                    'reads']
                                access1[index]['writes'] += access2[index][
                                    'writes']
                            else:
                                # copy so parent and child do not share the record
                                access1[index] = dict(access2[index])
                    return access1

                iters = numIters(linfo2['range'])
                if linfo2['loops']:
                    raise Exception(
                        'child loop cannot have nested loops of its own')

                # process flops
                for floptype in ['adds', 'multiplies', 'divides', 'specials']:
                    linfo1['flops'][floptype] = linfo1['flops'][
                        floptype] + iters * linfo2['flops'][floptype]

                # process scalars (the child's counts are scaled in place first)
                for scalar2 in linfo2['scalars']:
                    scalar2['reads'] *= iters
                    scalar2['writes'] *= iters
                    scalar1 = getMember(linfo1['scalars'], scalar2['name'])
                    if scalar1:
                        scalar1['reads'] += scalar2['reads']
                        scalar1['writes'] += scalar2['writes']
                    else:
                        linfo1['scalars'].append(scalar2)

                # process spatial arrays and state arrays
                for category in ['arrays', 'stateArrays']:
                    for array2 in linfo2[category]:
                        # species arrays and 'relStateArray' state arrays scale
                        # their number of copies; every other array scales its
                        # access counts instead
                        if (category == 'arrays' and isSpeciesArray(array2['name'])) or \
                           (category == 'stateArrays' and array2['arraytype'] == 'relStateArray'):
                            array2['copies'] = iters * array2['copies']
                        else:
                            for access in array2['access'].values():
                                access['reads'] *= iters
                                access['writes'] *= iters
                        array1 = getMember(linfo1[category], array2['name'])
                        if array1:
                            array1['access'] = mergeAccess(array1, array2)
                        else:
                            linfo1[category].append(array2)

            [loopvars, ranges, strides] = getRanges(linfo)
            if not (isType0(ranges) or isType1(ranges) or isType2(ranges)):
                if options.flag_warn:
                    print >> sys.stderr, "Warning: ignoring loop nest: %s" % str(
                        ranges)
                return

            # HACK: figure out loop type based on ranges
            sweeps = 1
            if isType1(ranges):
                # species sweep in outermost loop
                sweeps = numIters(linfo['range'])
                linfo = linfo['loops'][0]
                loopvars = loopvars[1:]
                ranges = ranges[1:]
                strides = strides[1:]
            elif isType2(ranges):
                # species sweep(s) inside third spatial loop level
                loopvars = loopvars[0:-1]
                ranges = ranges[0:-1]
                strides = strides[0:-1]
                linfo3 = linfo['loops'][0]['loops'][0]
                for linfo4 in linfo3['loops']:
                    mergeLoops(linfo3, linfo4)
                del linfo3['loops']

            # use data in innermost loop
            # (it takes over the line number of the outermost spatial loop)
            linfo['loops'][0]['loops'][0]['linenum'] = linfo['linenum']
            linfo = linfo['loops'][0]['loops'][0]
            del linfo['loopvar']
            del linfo['range']
            del linfo['stride']
            # getRanges collects outermost-first; store innermost-first
            loopvars.reverse()
            ranges.reverse()
            strides.reverse()
            linfo['loopvars'] = loopvars
            linfo['ranges'] = ranges
            linfo['strides'] = strides
            linfo['sweeps'] = sweeps

            return linfo
Exemplo n.º 6
0
        def writeDetails(f):
            f.write('DETAILED READ-ONLY ARRAY INFO\n\n')
            for fname in sorted(self.info.keys()):
                f.write('Function:|%s\n' % (fname))
                function = self.info[fname]
                for loop in function['loops']:

                    # print header
                    loopid = '%s.%d' % (fname, loop['linenum'])
                    f.write('Loop line num:|%d\n' % (loop['linenum']))
                    f.write('|||Iteration Space|||Block Iteration Space|||||Block Access Space|||||Bandwidth|||' + \
                            'Working Set|||Reuse WS|||WS (all reads)|||WS (reuse only)|||GBytes transferred/sweep\n')
                    f.write('||Name|X|Y|Z|X|Y|Z|X (cache line)|Num Blocks|' + \
                            'X#$CLABEL{accessx}|Y#$CLABEL{accessy}|Z#$CLABEL{accessz}|X (cache line)#$CLABEL{accessxcl}|' + \
                            'Copies#$CLABEL{aBWCopies}|Planes#$CLABEL{aBWPlanes}|' + \
                            'Pencils#$CLABEL{aBWPencils}|Cells#$CLABEL{aBWCells}|' + \
                            'Planes#$CLABEL{aWSPlanes}|Pencils#$CLABEL{aWSPencils}|Cells#$CLABEL{aWSCells}|' + \
                            'Planes#$CLABEL{aWSReusePlanes}|Pencils#$CLABEL{aWSReusePencils}|Cells#$CLABEL{aWSReuseCells}|' + \
                            'WS/plane/core (kB)#$CLABEL{aWSPlane}|' + \
                            'WS/pencil/core (kB)#$CLABEL{aWSPencil}|' + \
                            'WS/cell/core (kB)#$CLABEL{aWSCell}|' + \
                            'WS/plane/core (kB)#$CLABEL{aWSReusePlane}|' + \
                            'WS/pencil/core (kB)#$CLABEL{aWSReusePencil}|' + \
                            'WS/cell/core (kB)#$CLABEL{aWSReuseCell}|' + \
                            'Reuse between planes#$CLABEL{aBlockGbytes}|' + \
                            'Reuse between pencils#$CLABEL{aPlaneGbytes}|' + \
                            'Reuse within pencils#$CLABEL{aPencilGbytes}|' + \
                            'No reuse within pencils#$CLABEL{aCellGbytes}|' + \
                            '\n')

                    readArrays = sorted(analyze.getR(loop['arrays']),
                                        key=lambda x: x['name'])
                    for array in readArrays:

                        # name
                        f.write('||%s' % (array['name']))

                        # iteration space
                        f.write('|=%s' %
                                doRefSubs(numIters(loop['ranges'][0])))
                        f.write('|=%s' %
                                doRefSubs(numIters(loop['ranges'][1])))
                        f.write('|=%s' %
                                doRefSubs(numIters(loop['ranges'][2])))

                        # block iteration space (with also X rounded up to nearest cache line) and num blocks
                        f.write('|=%s' % getRefs('X Block Size'))
                        f.write('|=%s' % getRefs('Y Block Size'))
                        f.write('|=%s' % getRefs('Z Block Size'))
                        f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('blockx', 'Cache Line Size', 'Word Size', \
                                                                                      'Cache Line Size', 'Word Size'))
                        f.write('|=%s/%s*%s/%s*%s/%s' %
                                getRefs('iterx', 'blockx', 'itery', 'blocky',
                                        'iterz', 'blockz'))

                        # block access space (with also X rounded up to nearest cache line)
                        f.write('|=%s+%d' %
                                (getRef('blockx'), diff(array['ghost'][0])[0]))
                        f.write('|=%s+%d' %
                                (getRef('blocky'), diff(array['ghost'][1])[0]))
                        f.write('|=%s+%d' %
                                (getRef('blockz'), diff(array['ghost'][2])[0]))
                        f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('accessx', 'Cache Line Size', 'Word Size', \
                                                                                       'Cache Line Size', 'Word Size'))

                        # bandwidth and working set figures from access pattern analysis
                        f.write('|=%s'*10 % tuple(map(doRefSubs, [array['copies'], array['BW']['numPlanes'], \
                                array['BW']['numPencils'], array['BW']['numCells'], array['WS']['numPlanes'], \
                                array['WS']['numPencils'], array['WS']['numCells'], array['WS']['numReusePlanes'], \
                                array['WS']['numReusePencils'], array['WS']['numReuseCells']])))

                        # working set size formulas (kB)

                        if array['stenciltype'] == 'faces':
                            f.write('|=%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))/2^10' % \
                                getRefs('Word Size', 'aBWCopies', \
                                        'aWSPlanes', 'blockxcl', 'blocky', \
                                        'blockxcl', 'accessy', \
                                        'accessxcl', 'blocky', \
                                        'blockxcl', 'blocky'))
                        else:
                            f.write('|=%s*%s*%s*%s*%s/2^10' % \
                                getRefs('Word Size', 'aBWCopies', 'aWSPlanes', 'accessxcl', 'accessy'))

                        if array['stenciltype'] == 'faces':
                            f.write('|=%s*%s*((%s-1)*(%s)+1*(%s))/2^10' % \
                                getRefs('Word Size', 'aBWCopies', 'aWSPencils', 'blockxcl', 'accessxcl'))
                        else:
                            f.write('|=%s*%s*%s*%s/2^10' %
                                    getRefs('Word Size', 'aBWCopies',
                                            'aWSPencils', 'accessxcl'))

                        f.write('|=%s*%s*%s/2^10' %
                                getRefs('Word Size', 'aBWCopies', 'aWSCells'))

                        # working set size formulas (reuse only) (kB)

                        if array['stenciltype'] == 'faces':
                            f.write('|=%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))/2^10' % \
                                getRefs('Word Size', 'aBWCopies', \
                                        'aWSReusePlanes', 'blockxcl', 'blocky', \
                                        'blockxcl', 'accessy', \
                                        'accessxcl', 'blocky', \
                                        'blockxcl', 'blocky'))
                        else:
                            f.write('|=%s*%s*%s*%s*%s/2^10' % \
                                getRefs('Word Size', 'aBWCopies', 'aWSReusePlanes', 'accessxcl', 'accessy'))

                        if array['stenciltype'] == 'faces':
                            f.write('|=%s*%s*((%s-1)*(%s)+1*(%s))/2^10' % \
                                getRefs('Word Size', 'aBWCopies', 'aWSReusePencils', 'blockxcl', 'accessxcl'))
                        else:
                            f.write('|=%s*%s*%s*%s/2^10' %
                                    getRefs('Word Size', 'aBWCopies',
                                            'aWSReusePencils', 'accessxcl'))

                        f.write(
                            '|=%s*%s*%s/2^10' %
                            getRefs('Word Size', 'aBWCopies', 'aWSReuseCells'))

                        # memory traffic formulas (GBytes per sweep)
                        if array['stenciltype'] == 'faces':
                            f.write('|=%s*%s*%s*(%s*%s*%s+%s*%s*%s+%s*%s*%s-2*%s*%s*%s)/2^30' % \
                                getRefs('numBlocks', 'aBWCopies', 'R cost', 'accessxcl', 'blocky', 'blockz', 'blockxcl', \
                                        'accessy', 'blockz', 'blockxcl', 'blocky', 'accessz', 'blockxcl', 'blocky', 'blockz'))
                            f.write('|=%s*%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))*%s/2^30' % \
                                getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                        'aBWPlanes', 'blockxcl', 'blocky', \
                                        'accessxcl', 'blocky', \
                                        'blockxcl', 'accessy', \
                                        'blockxcl', 'blocky', \
                                        'blockz'))
                            f.write('|=%s*%s*%s*((%s-1)*%s+1*%s)*%s*%s/2^30' % \
                                getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                        'aBWPencils', 'blockxcl', \
                                        'accessxcl', \
                                        'blocky', 'blockz'))
                        else:
                            f.write('|=%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                                                         'accessxcl', 'accessy', 'accessz'))
                            f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWPlanes', 'R cost', \
                                                                            'accessxcl', 'accessy', 'blockz'))
                            f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWPencils', 'R cost', \
                                                                            'accessxcl', 'blocky', 'blockz'))
                        f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWCells', 'R cost', \
                                                                        'blockx', 'blocky', 'blockz'))

                        f.write('\n')

                    def addFname(*args):
                        """Return the given names as a tuple, each prefixed with '<loopid>.'."""
                        return tuple('%s.%s' % (loopid, name) for name in args)

                    temp = '|=sum(%%s:%%s)#$ELABEL{%s}' * 10 % \
                           addFname('WSPlane', 'WSPencil', 'WSCell', \
                                    'WSReusePlane', 'WSReusePencil', 'WSReuseCell', \
                                    'blockGbytes', 'planeGbytes', 'pencilGbytes', 'cellGbytes')

                    f.write('||Total' + '|' * 22 + temp % (getRefs(
                        (-max(len(readArrays), 1), 0), (-1, 0)) * 10))
                    f.write('\n')

                f.write('\n')
Exemplo n.º 7
0
        def writePWSummary(f):
            """Write the 'WOODWARD LOOP ANALYSIS' summary table to ``f``.

            Output is a title line, a group-caption line, a column-label line,
            one row per loop found in ``self.info[fname]['loops']`` (function
            names visited in sorted order) and a closing 'Total/Max' row.
            Cells are separated by '|'; every cell after the function name and
            line number is a formula starting with '='.

            f -- object with a ``write(str)`` method; nothing is returned.
            """
            emit = f.write

            def cell(template, *labels):
                # one cell whose placeholders are all filled by getRefs(labels)
                emit(template % getRefs(*labels))

            # title, group captions and labelled column headers
            emit('WOODWARD LOOP ANALYSIS\n')
            emit('|||Iteration Space|||Block Iteration Space|||||Flops/cell/sweep||||Registers||Num Blocks||||Working Set|Bandwidth||||Computation (Gflops)|||Estimated execution times (s)\n')
            emit(''.join((
                'function|line|sweeps#$CLABEL{sweeps}|',
                'X#$CLABEL{iterx}|Y#$CLABEL{itery}|Z#$CLABEL{iterz}|',
                'X#$CLABEL{blockx}|Y#$CLABEL{blocky}|Z#$CLABEL{blockz}|',
                'X (cache line)#$CLABEL{blockxcl}|Num Blocks#$CLABEL{numBlocks}|',
                'add#$CLABEL{add}|mul#$CLABEL{mul}|div#$CLABEL{div}|special#$CLABEL{spec}|GP Regs|FP Regs|',
                'Resident#$CLABEL{ResidentBlocks}|',
                'R#$CLABEL{RBlocks}|',
                'W#$CLABEL{WBlocks}|',
                'RW#$CLABEL{RWBlocks}|',
                'kB#$CLABEL{WSBlock}|',
                'R (GB)#$CLABEL{BWBlockR}|',
                'W (GB)#$CLABEL{BWBlockW}|',
                'RW (GB)#$CLABEL{BWBlockRW}|',
                'Total (GB)#$CLABEL{BWBlock}|',
                'Gflops performed#$CLABEL{gflopsPW}|Weighted Gflops#$CLABEL{wGflopsPW}|B/F ratio|',
                'time (CPU)#$CLABEL{timeCPUPW}|time (DRAM)#$CLABEL{timeRAMPW}|time (CPU and DRAM)|\n',
            )))

            # one row per loop
            for fname in sorted(self.info.keys()):
                for loop in self.info[fname]['loops']:
                    flops = loop['flops']

                    # function name, line number, sweep count
                    emit('%s|%d|=%s' % (fname, loop['linenum'],
                                        doRefSubs(loop['sweeps'])))

                    # iteration space (three axes)
                    for axis in (0, 1, 2):
                        emit('|=%s' %
                             doRefSubs(numIters(loop['ranges'][axis])))

                    # expanded block iteration space with overlapped ghost regions
                    for labels in (('X Block Size', 'iterx', 'X Problem Size'),
                                   ('Y Block Size', 'itery', 'Y Problem Size'),
                                   ('Z Block Size', 'iterz', 'Z Problem Size')):
                        cell('|=%s+(%s-%s)', *labels)

                    # block X rounded up to nearest cache line, then num blocks
                    cell('|=ceiling(%s/(%s/%s),1)*(%s/%s)',
                         'blockx', 'Cache Line Size', 'Word Size',
                         'Cache Line Size', 'Word Size')
                    cell('|=%s/%s*%s/%s*%s/%s',
                         'X Problem Size', 'X Block Size',
                         'Y Problem Size', 'Y Block Size',
                         'Z Problem Size', 'Z Block Size')

                    # flops
                    emit('|=%s|=%s|=%s|=%s' % tuple(
                        doRefSubs(flops[kind])
                        for kind in ('adds', 'multiplies', 'divides', 'specials')))

                    # registers: general purpose (ints + ptrs), then floating point
                    gp_regs = doRefSubs(loop['registers']['ints'] +
                                        loop['registers']['ptrs'])
                    fp_regs = doRefSubs(loop['registers']['floats'])
                    emit('|=%s|=%s' % (gp_regs, fp_regs))

                    # number of blocks resident, read, and written
                    block_counts = [loop['WS']['numBlocks'],
                                    loop['BW']['numBlocks']['R'],
                                    loop['BW']['numBlocks']['W'],
                                    loop['BW']['numBlocks']['RW']]
                    emit(('|=%s' * 4) %
                         tuple(doRefSubs(count) for count in block_counts))

                    # working set estimate (kB)
                    emit('|=%s*(%s)/2^10' %
                         (getRef('Word Size'),
                          doRefSubs(loop['WS']['sizeBlocks'])))

                    # bandwidth estimates for R, W and RW blocks, then their sum
                    for cost, mode in (('R cost', 'R'), ('W cost', 'W'),
                                       ('RW cost', 'RW')):
                        refs = getRefs('WSBlock', '$/thread group (kB)',
                                       cost, 'numBlocks')
                        size = doRefSubs(loop['BW']['sizeBlocks'][mode])
                        emit('|=IF(%s<%s, %s*%s*(%s)/2^30, "N/A")' %
                             (refs + (size,)))
                    cell('|=%s+%s+%s', 'BWBlockR', 'BWBlockW', 'BWBlockRW')

                    # Gflops and arithmetic intensity
                    cell('|=%s*(%s+%s+%s+%s)*%s*%s*%s*%s/1e9',
                         'sweeps', 'add', 'mul', 'div', 'spec',
                         'blockx', 'blocky', 'blockz', 'numBlocks')
                    cell('|=%s*(%s+%s+%s*%s+%s*%s)*%s*%s*%s*%s/1e9',
                         'sweeps', 'add', 'mul', 'Division Cost', 'div',
                         'Special Cost', 'spec',
                         'blockx', 'blocky', 'blockz', 'numBlocks')
                    cell('|=(%s*2^30)/(%s*10^9)', 'BWBlock', 'wGflopsPW')

                    # estimated execution times
                    cell('|=%s/(%s*%s)', 'wGflopsPW', 'Gflop/s/thread', 'Threads')
                    cell('|=%s/(%s*%s)', 'BWBlock', 'GB/s/thread', 'Threads')
                    cell('|=max(%s:%s)', 'timeCPUPW', 'timeRAMPW')

                    emit('\n')

            # totals row: each formula spans the numLoops cells directly above it
            numLoops = sum(len(func['loops']) for func in self.info.values())
            rows_above = ((-numLoops, 0), (-1, 0))
            emit('Total/Max' + '|' * 20)
            cell('|=max(%s:%s)', *rows_above)
            emit(('|=sum(%s:%s)' % getRefs(*rows_above)) * 6)
            cell('|=(%s*2^30)/(%s*10^9)', 'BWBlock', 'wGflopsPW')
            emit(('|=sum(%s:%s)' % getRefs(*rows_above)) * 3)

            emit('\n\n')
Exemplo n.º 8
0
        def writeSummary(f):
            """Write the 'LOOP ANALYSIS' summary table to ``f``.

            Output is a title line, a group-caption line, a column-label line,
            one row per loop found in ``self.info[fname]['loops']`` (function
            names visited in sorted order) and a closing 'Total/Max' row.
            Cells are separated by '|'; every cell after the function name and
            line number is a formula starting with '='.  Several formulas refer
            to per-loop labels of the form '<fname>.<linenum>.<name>'.

            f -- object with a ``write(str)`` method; nothing is returned.
            """
            emit = f.write

            def cell(template, *labels):
                # one cell whose placeholders are all filled by getRefs(labels)
                emit(template % getRefs(*labels))

            # title, group captions and labelled column headers
            emit('LOOP ANALYSIS\n')
            emit('|||Iteration Space|||Block Iteration Space|||||Flops/cell/sweep||||Registers||RW/W Arrays||RW/W Working Set|||Working set (naive)|||Working set (streaming writes)|||Working set (reuse only)|||Working set (actual)|||Memory Traffic (GBytes)|||||Computation (Gflops)|||Estimated execution times (s)\n')
            emit(''.join((
                'function|line|sweeps#$CLABEL{sweeps}|',
                'X#$CLABEL{iterx}|Y#$CLABEL{itery}|Z#$CLABEL{iterz}|',
                'X#$CLABEL{blockx}|Y#$CLABEL{blocky}|Z#$CLABEL{blockz}|',
                'X (cache line)#$CLABEL{blockxcl}|Num Blocks#$CLABEL{numBlocks}|',
                'add#$CLABEL{add}|mul#$CLABEL{mul}|div#$CLABEL{div}|special#$CLABEL{spec}|GP Regs|FP Regs|',
                'RW Arrays#$CLABEL{RWArrays}|W Arrays#$CLABEL{WArrays}|',
                'RW/W WS Planes#$CLABEL{WSWPlanes}|RW/W WS Pencils#$CLABEL{WSWPencils}|RW/W WS Cells#$CLABEL{WSWCells}|',
                'WS/plane/core (kB)#$CLABEL{WSPlane}|',
                'WS/pencil/core (kB)#$CLABEL{WSPencil}|',
                'WS/cell/core (kB)#$CLABEL{WSCell}|',
                'WS/plane/core (kB)#$CLABEL{WSStreamPlane}|',
                'WS/pencil/core (kB)#$CLABEL{WSStreamPencil}|',
                'WS/cell/core (kB)#$CLABEL{WSStreamCell}|',
                'WS/plane/core (kB)#$CLABEL{WSReusePlane}|',
                'WS/pencil/core (kB)#$CLABEL{WSReusePencil}|',
                'WS/cell/core (kB)#$CLABEL{WSReuseCell}|',
                'WS/plane/core (kB)#$CLABEL{WSActualPlane}|',
                'WS/pencil/core (kB)#$CLABEL{WSActualPencil}|',
                'WS/cell/core (kB)#$CLABEL{WSActualCell}|',
                'Reuse between planes#$CLABEL{blockGbytes}|',
                'Reuse between pencils#$CLABEL{planeGbytes}|',
                'Reuse within pencils#$CLABEL{pencilGbytes}|',
                'No reuse within pencils#$CLABEL{cellGbytes}|',
                'Actual#$CLABEL{aGbytes}|',
                'Gflops performed#$CLABEL{gflops}|Weighted Gflops#$CLABEL{wGflops}|B/F ratio|',
                'time (CPU)#$CLABEL{timeCPU}|time (DRAM)#$CLABEL{timeRAM}|time (CPU and DRAM)|\n',
            )))

            # one row per loop
            for fname in sorted(self.info.keys()):
                for loop in self.info[fname]['loops']:
                    flops = loop['flops']
                    # prefix of the per-loop labels referenced further down
                    loopid = '%s.%d' % (fname, loop['linenum'])

                    # function name, line number, sweep count
                    emit('%s|%d|=%s' % (fname, loop['linenum'],
                                        doRefSubs(loop['sweeps'])))

                    # iteration space (three axes)
                    for axis in (0, 1, 2):
                        emit('|=%s' %
                             doRefSubs(numIters(loop['ranges'][axis])))

                    # block iteration space
                    for label in ('X Block Size', 'Y Block Size', 'Z Block Size'):
                        cell('|=%s', label)

                    # block X rounded up to nearest cache line, then num blocks
                    cell('|=ceiling(%s/(%s/%s),1)*(%s/%s)',
                         'blockx', 'Cache Line Size', 'Word Size',
                         'Cache Line Size', 'Word Size')
                    cell('|=%s/%s*%s/%s*%s/%s',
                         'iterx', 'blockx', 'itery', 'blocky', 'iterz', 'blockz')

                    # flops
                    emit('|=%s|=%s|=%s|=%s' % tuple(
                        doRefSubs(flops[kind])
                        for kind in ('adds', 'multiplies', 'divides', 'specials')))

                    # registers: general purpose (ints + ptrs), then floating point
                    gp_regs = doRefSubs(loop['registers']['ints'] +
                                        loop['registers']['ptrs'])
                    fp_regs = doRefSubs(loop['registers']['floats'])
                    emit('|=%s|=%s' % (gp_regs, fp_regs))

                    # read/write and write arrays
                    numArrays = loop['BW']['numArrays']
                    rw_arrays = doRefSubs(numArrays['RW'])
                    w_arrays = doRefSubs(numArrays['W'])
                    emit('|=%s|=%s' % (rw_arrays, w_arrays))

                    # working set planes, pencils, and cells (RW + W)
                    for counts in (loop['WS']['numPlanes'],
                                   loop['WS']['numPencils'],
                                   loop['WS']['numCells']):
                        emit('|=%s' % doRefSubs(counts['RW'] + counts['W']))

                    # working set sizes (kB): per-loop figure plus RW/W contribution
                    cell('|=%s+%s*%s*%s*%s/2^10',
                         '%s.WSPlane' % loopid, 'Word Size',
                         'WSWPlanes', 'blockxcl', 'blocky')
                    cell('|=%s+%s*%s*%s/2^10',
                         '%s.WSPencil' % loopid, 'Word Size',
                         'WSWPencils', 'blockxcl')
                    cell('|=%s+%s*%s/2^10',
                         '%s.WSCell' % loopid, 'Word Size', 'WSWCells')

                    # per-loop figures copied through unchanged
                    for pattern in ('%s.WSPlane', '%s.WSPencil', '%s.WSCell',
                                    '%s.WSReusePlane', '%s.WSReusePencil',
                                    '%s.WSReuseCell'):
                        cell('|=%s', pattern % loopid)

                    # "actual" working set selection
                    cell('|=IF(%s=%s,%s,IF(%s, %s, IF(%s, %s, %s)))',
                         'blockGbytes', 'planeGbytes', 'WSActualPencil',
                         'NTA Hints', 'WSReusePlane',
                         'Streaming Writes', 'WSStreamPlane', 'WSPlane')
                    cell('|=IF(%s=%s,%s,IF(%s, %s, IF(%s, %s, %s)))',
                         'planeGbytes', 'pencilGbytes', 'WSActualCell',
                         'NTA Hints', 'WSReusePencil',
                         'Streaming Writes', 'WSStreamPencil', 'WSPencil')
                    cell('|=IF(%s, %s, IF(%s, %s, %s))',
                         'NTA Hints', 'WSReuseCell',
                         'Streaming Writes', 'WSStreamCell', 'WSCell')

                    # GBytes transferred (use results from detailed read-only array analysis)
                    for pattern in ('%s.blockGbytes', '%s.planeGbytes',
                                    '%s.pencilGbytes', '%s.cellGbytes'):
                        cell('|=%s*(%s+(%s*%s+%s*%s)*%s*%s*%s*%s/2^30)',
                             'sweeps', pattern % loopid,
                             'RWArrays', 'RW cost', 'WArrays', 'W cost',
                             'numBlocks', 'blockxcl', 'blocky', 'blockz')
                    cell('|=IF(%s<%s,MIN(%s:%s),IF(%s<%s,MIN(%s:%s),IF(%s<%s,MIN(%s:%s),%s)))',
                         'WSActualPlane', '$/thread group (kB)',
                         'blockGbytes', 'cellGbytes',
                         'WSActualPencil', '$/thread group (kB)',
                         'planeGbytes', 'cellGbytes',
                         'WSActualCell', '$/thread group (kB)',
                         'pencilGbytes', 'cellGbytes',
                         'cellGbytes')

                    # Gflops and arithmetic intensity
                    cell('|=%s*(%s+%s+%s+%s)*%s*%s*%s/1e9',
                         'sweeps', 'add', 'mul', 'div', 'spec',
                         'iterx', 'itery', 'iterz')
                    cell('|=%s*(%s+%s+%s*%s+%s*%s)*%s*%s*%s/1e9',
                         'sweeps', 'add', 'mul', 'Division Cost', 'div',
                         'Special Cost', 'spec',
                         'iterx', 'itery', 'iterz')
                    cell('|=(%s*2^30)/(%s*10^9)', 'aGbytes', 'wGflops')

                    # estimated execution times
                    cell('|=%s/(%s*%s)', 'wGflops', 'Gflop/s/thread', 'Threads')
                    cell('|=%s/(%s*%s)', 'aGbytes', 'GB/s/thread', 'Threads')
                    cell('|=max(%s:%s)', 'timeCPU', 'timeRAM')

                    emit('\n')

            # totals row: each formula spans the numLoops cells directly above it
            numLoops = sum(len(func['loops']) for func in self.info.values())
            rows_above = ((-numLoops, 0), (-1, 0))
            emit('Total/Max' + '|' * 30)
            emit(('|=max(%s:%s)' % getRefs(*rows_above)) * 3)
            emit(('|=sum(%s:%s)' % getRefs(*rows_above)) * 7)
            cell('|=(%s*2^30)/(%s*10^9)', 'aGbytes', 'wGflops')
            emit(('|=sum(%s:%s)' % getRefs(*rows_above)) * 3)

            emit('\n\n')