def runPWLoopModel(linfo, my_params):
    """Evaluate the whole-block ("PW") performance model for one loop nest.

    linfo     -- loop info dict; this function reads 'linenum', 'sweeps',
                 'ranges', 'registers', 'WS', 'BW' and 'flops'.
    my_params -- machine/problem parameter dict keyed by the human-readable
                 names used below ('X Block Size', 'Word Size', ...).

    Returns a dict with the keys 'linenum', 'range', 'block', 'numBlocks',
    'GPRegs', 'FPRegs', 'regAlloc', 'WSFinal', 'BWFinal', 'adds',
    'multiplies', 'divides', 'specials', 'flops', 'wflops', 'BF',
    'cputime', 'ramtime' and 'time'.
    """
    result = {}
    result['linenum'] = linfo['linenum']
    numSweeps = doParamSubs(linfo['sweeps'], my_params)
    result['range'] = (doParamSubs(numIters(linfo['ranges'][0]), my_params),
                       doParamSubs(numIters(linfo['ranges'][1]), my_params),
                       doParamSubs(numIters(linfo['ranges'][2]), my_params))

    # each block is the nominal block size grown by the difference between
    # the loop's iteration range and the problem size in that dimension
    blockx = my_params['X Block Size'] + (result['range'][0] - my_params['X Problem Size'])
    blocky = my_params['Y Block Size'] + (result['range'][1] - my_params['Y Problem Size'])
    blockz = my_params['Z Block Size'] + (result['range'][2] - my_params['Z Problem Size'])
    result['block'] = [blockx, blocky, blockz]

    # Force true division: with integer parameters a bare '/' truncates on
    # Python 2, which silently drops partial blocks and disagrees with
    # runLoopModel (which already divides as float).
    problemCells = my_params['X Problem Size'] * my_params['Y Problem Size'] * my_params['Z Problem Size']
    blockCells = my_params['X Block Size'] * my_params['Y Block Size'] * my_params['Z Block Size']
    numBlocks = float(problemCells) / blockCells
    result['numBlocks'] = numBlocks

    # registers
    result['GPRegs'] = doParamSubs(
        linfo['registers']['ints'] + linfo['registers']['ptrs'], my_params)
    result['FPRegs'] = doParamSubs(linfo['registers']['floats'], my_params)
    result['regAlloc'] = regAllocModel(linfo, my_params)

    # save working set and memory traffic
    result['WSFinal'] = doParamSubs(
        my_params['Word Size'] * linfo['WS']['sizeBlocks'], my_params)
    if result['WSFinal'] <= my_params['$/thread group (kB)'] * 2**10:
        sizeBlocks = linfo['BW']['sizeBlocks']
        result['BWFinal'] = numBlocks * doParamSubs(
            my_params['R cost'] * sizeBlocks['R'] +
            my_params['W cost'] * sizeBlocks['W'] +
            my_params['RW cost'] * sizeBlocks['RW'], my_params)
    else:
        # if not enough cache, punt on this method
        result['BWFinal'] = float('inf')

    # number of flops and weighted flops
    numCellIters = numSweeps * numBlocks * prod(result['block'])
    for floptype in ('adds', 'multiplies', 'divides', 'specials'):
        result[floptype] = numCellIters * doParamSubs(
            linfo['flops'][floptype], my_params)
    result['flops'] = result['adds'] + result['multiplies'] + \
        result['divides'] + result['specials']
    result['wflops'] = result['adds'] + result['multiplies'] + \
        my_params['Division Cost'] * result['divides'] + \
        my_params['Special Cost'] * result['specials']

    # arithmetic intensity (bytes per weighted flop); NaN when no flops
    if result['wflops'] != 0:
        result['BF'] = float(result['BWFinal']) / result['wflops']
    else:
        result['BF'] = float('nan')

    # execution time
    result['cputime'] = float(result['wflops']) / \
        (my_params['Gflop/s/thread'] * my_params['Threads'] * 10**9)
    result['ramtime'] = float(result['BWFinal']) / \
        (my_params['GB/s/thread'] * my_params['Threads'] * 2**30)

    # assume perfect overlap: only the larger of the two times is kept,
    # the other is zeroed in the result
    if result['cputime'] > result['ramtime']:
        result['ramtime'] = 0
    else:
        result['cputime'] = 0
    result['time'] = max(result['cputime'], result['ramtime'])
    return result
def iter_n(self):
    """Return numIters(self.range) divided by self.stride.

    The division uses the plain '/' operator, so its result type follows
    the operand types.
    """
    span = numIters(self.range)
    return span / self.stride
def runLoopModel(linfo, my_params):
    """Evaluate the cache-blocked performance model for one loop nest.

    linfo     -- loop info dict; this function reads 'linenum', 'ranges',
                 'registers', 'arrays', 'WS', 'BW', 'sweeps' and 'flops'.
    my_params -- machine/problem parameter dict keyed by the human-readable
                 names used below ('X Block Size', 'Cache Line Size', ...).

    Returns a dict holding, among others:
      'arrays'  -- per read-only-array working set ('WS') and traffic ('BW')
      'WS'      -- working sets for the 'all', 'stream', 'reuse' and
                   'actual' scenarios, each with 'plane'/'pencil'/'cell'
      'BW'      -- memory traffic for 'block'/'plane'/'pencil'/'cell' reuse
                   plus the selected 'actual' value
      'WSFinal', 'BWFinal', flop counts, 'BF', 'cputime', 'ramtime', 'time'
    """
    result = {}
    result['linenum'] = linfo['linenum']
    result['range'] = (doParamSubs(numIters(linfo['ranges'][0]), my_params),
                       doParamSubs(numIters(linfo['ranges'][1]), my_params),
                       doParamSubs(numIters(linfo['ranges'][2]), my_params))

    blockx = my_params['X Block Size']
    blocky = my_params['Y Block Size']
    blockz = my_params['Z Block Size']
    result['block'] = [blockx, blocky, blockz]
    result['numBlocks'] = float(prod(result['range'])) / prod(result['block'])

    # round blockx up to nearest cache line multiple
    # (done after result['block'] / result['numBlocks'] are recorded, so
    # those keep the unrounded block size)
    CLWords = my_params['Cache Line Size'] / my_params['Word Size']
    blockx = math.ceil(float(blockx) / CLWords) * CLWords

    # registers
    result['GPRegs'] = doParamSubs(
        linfo['registers']['ints'] + linfo['registers']['ptrs'], my_params)
    result['FPRegs'] = doParamSubs(linfo['registers']['floats'], my_params)
    result['regAlloc'] = regAllocModel(linfo, my_params)

    wordSize = my_params['Word Size']
    readCost = my_params['R cost']
    numBlocks = result['numBlocks']

    # Compute working sets and memory traffic for read-only arrays
    arrays = []
    for ainfo in analyze.getR(linfo['arrays']):
        array = {}
        array['name'] = ainfo['name']
        # list(...) keeps the result indexable whether map() returns a list
        # (Python 2) or an iterator (Python 3)
        array['access'] = list(map(lambda x, y: x + diff(y)[0],
                                   result['block'], ainfo['ghost']))
        accessx = array['access'][0]
        accessy = array['access'][1]
        accessz = array['access'][2]

        # round accessx up to nearest cache line multiple
        accessx = math.ceil(float(accessx) / CLWords) * CLWords

        aWS = ainfo['WS']
        aBW = ainfo['BW']
        facesOnly = ainfo['stenciltype'] == 'faces'

        # WS calculation for generic case
        array['WS'] = {
            'all': {
                'plane': wordSize * aWS['numPlanes'] * accessx * accessy,
                'pencil': wordSize * aWS['numPencils'] * accessx,
                'cell': wordSize * aWS['numCells'],
            },
            'reuse': {
                'plane': wordSize * aWS['numReusePlanes'] * accessx * accessy,
                'pencil': wordSize * aWS['numReusePencils'] * accessx,
                'cell': wordSize * aWS['numReuseCells'],
            },
        }

        # fix the plane WS for faces-only stencils
        if facesOnly:
            array['WS']['all']['plane'] = wordSize * \
                ((aWS['numPlanes'] - 1) * (blockx * blocky) +
                 1 * (blockx * accessy + accessx * blocky - blockx * blocky))
            array['WS']['all']['pencil'] = wordSize * \
                ((aWS['numPencils'] - 1) * blockx + 1 * accessx)
            array['WS']['reuse']['plane'] = wordSize * \
                ((aWS['numReusePlanes'] - 1) * (blockx * blocky) +
                 1 * (blockx * accessy + accessx * blocky - blockx * blocky))
            array['WS']['reuse']['pencil'] = wordSize * \
                ((aWS['numReusePencils'] - 1) * blockx + 1 * accessx)

        # BW calculation for generic case
        array['BW'] = {
            'block': numBlocks * aBW['numCopies'] * readCost *
                     accessx * accessy * accessz,
            'plane': numBlocks * aBW['numPlanes'] * readCost *
                     accessx * accessy * blockz,
            'pencil': numBlocks * aBW['numPencils'] * readCost *
                      accessx * blocky * blockz,
            'cell': numBlocks * aBW['numCells'] * readCost *
                    blockx * blocky * blockz,
        }

        # fix the block and plane BW for faces-only stencils
        if facesOnly:
            array['BW']['block'] = numBlocks * aBW['numCopies'] * readCost * \
                (accessx * blocky * blockz + blockx * accessy * blockz +
                 blockx * blocky * accessz - 2 * blockx * blocky * blockz)
            array['BW']['plane'] = numBlocks * readCost * \
                ((aBW['numPlanes'] - 1) * (blockx * blocky) +
                 1 * (accessx * blocky + blockx * accessy - blockx * blocky)) * blockz
            array['BW']['pencil'] = numBlocks * readCost * \
                ((aBW['numPencils'] - 1) * blockx + 1 * accessx) * blocky * blockz

        arrays.append(array)

    sumWSAllPlane = sum(a['WS']['all']['plane'] for a in arrays)
    sumWSAllPencil = sum(a['WS']['all']['pencil'] for a in arrays)
    sumWSAllCell = sum(a['WS']['all']['cell'] for a in arrays)
    sumWSReusePlane = sum(a['WS']['reuse']['plane'] for a in arrays)
    sumWSReusePencil = sum(a['WS']['reuse']['pencil'] for a in arrays)
    sumWSReuseCell = sum(a['WS']['reuse']['cell'] for a in arrays)
    sumBWBlock = sum(a['BW']['block'] for a in arrays)
    sumBWPlane = sum(a['BW']['plane'] for a in arrays)
    sumBWPencil = sum(a['BW']['pencil'] for a in arrays)
    sumBWCell = sum(a['BW']['cell'] for a in arrays)
    result['arrays'] = arrays

    # Compute working sets for different scenarios
    lPlanes = linfo['WS']['numPlanes']
    lPencils = linfo['WS']['numPencils']
    lCells = linfo['WS']['numCells']
    result['WS'] = {
        'all': {
            'plane': sumWSAllPlane + wordSize *
                     (lPlanes['RW'] + lPlanes['W']) * blockx * blocky,
            'pencil': sumWSAllPencil + wordSize *
                      (lPencils['RW'] + lPencils['W']) * blockx,
            'cell': sumWSAllCell + wordSize *
                    (lCells['RW'] + lCells['W']),
        },
        'stream': {
            'plane': sumWSAllPlane,
            'pencil': sumWSAllPencil,
            'cell': sumWSAllCell,
        },
        'reuse': {
            'plane': sumWSReusePlane,
            'pencil': sumWSReusePencil,
            'cell': sumWSReuseCell,
        },
    }

    # Determine actual working sets based on cache utilization policy.
    # Take a copy: 'actual' is adjusted further below, and sharing the dict
    # object would overwrite the scenario it was derived from.
    if my_params['NTA Hints']:
        result['WS']['actual'] = dict(result['WS']['reuse'])
    elif my_params['Streaming Writes']:
        result['WS']['actual'] = dict(result['WS']['stream'])
    else:
        result['WS']['actual'] = dict(result['WS']['all'])

    # Compute memory traffic for different reuse scenarios
    numSweeps = doParamSubs(linfo['sweeps'], my_params)
    RWWBW = (linfo['BW']['numArrays']['RW'] * my_params['RW cost'] +
             linfo['BW']['numArrays']['W'] * my_params['W cost']) * \
        result['numBlocks'] * blockx * blocky * blockz
    result['BW'] = {
        'block': numSweeps * (sumBWBlock + RWWBW),
        'plane': numSweeps * (sumBWPlane + RWWBW),
        'pencil': numSweeps * (sumBWPencil + RWWBW),
        'cell': numSweeps * (sumBWCell + RWWBW),
    }

    # do symbolic parameter substitutions
    # (iterate over a snapshot of the items because values are reassigned)
    for scenario in result['WS'].values():
        for (key, value) in list(scenario.items()):
            scenario[key] = doParamSubs(value, my_params)
    for (key, value) in list(result['BW'].items()):
        result['BW'][key] = doParamSubs(value, my_params)

    # bandwidth should be no worse than model prediction for cases with worse reuse,
    # but model approximations cause different inaccuracies for different cases
    result['BW']['pencil'] = min(result['BW']['pencil'], result['BW']['cell'])
    result['BW']['plane'] = min(result['BW']['plane'], result['BW']['pencil'])
    result['BW']['block'] = min(result['BW']['block'], result['BW']['plane'])

    # if there's no difference between memory traffic, working set is effectively reduced
    actualWS = result['WS']['actual']
    if result['BW']['pencil'] == result['BW']['cell']:
        actualWS['cell'] = 0
    if result['BW']['plane'] == result['BW']['pencil']:
        actualWS['pencil'] = actualWS['cell']
    if result['BW']['block'] == result['BW']['plane']:
        actualWS['plane'] = actualWS['pencil']

    # Compute "actual" memory traffic based on type of reuse given available cache
    cacheBytes = my_params['$/thread group (kB)'] * 2**10
    if actualWS['plane'] <= cacheBytes:
        result['BW']['actual'] = result['BW']['block']
    elif actualWS['pencil'] <= cacheBytes:
        result['BW']['actual'] = result['BW']['plane']
    elif actualWS['cell'] <= cacheBytes:
        result['BW']['actual'] = result['BW']['pencil']
    else:
        result['BW']['actual'] = result['BW']['cell']

    # save final WS and BW
    result['WSFinal'] = result['WS']['actual']['plane']
    result['BWFinal'] = result['BW']['actual']

    # number of flops and weighted flops
    for floptype in ('adds', 'multiplies', 'divides', 'specials'):
        result[floptype] = numSweeps * prod(result['range']) * doParamSubs(
            linfo['flops'][floptype], my_params)
    result['flops'] = result['adds'] + result['multiplies'] + \
        result['divides'] + result['specials']
    result['wflops'] = result['adds'] + result['multiplies'] + \
        my_params['Division Cost'] * result['divides'] + \
        my_params['Special Cost'] * result['specials']

    # arithmetic intensity (bytes per weighted flop); NaN when no flops
    if result['wflops'] != 0:
        result['BF'] = float(result['BWFinal']) / result['wflops']
    else:
        result['BF'] = float('nan')

    # execution time
    result['cputime'] = float(result['wflops']) / \
        (my_params['Gflop/s/thread'] * my_params['Threads'] * 10**9)
    result['ramtime'] = float(result['BW']['actual']) / \
        (my_params['GB/s/thread'] * my_params['Threads'] * 2**30)

    # assume perfect overlap: only the larger of the two times is kept,
    # the other is zeroed in the result
    if result['cputime'] > result['ramtime']:
        result['ramtime'] = 0
    else:
        result['cputime'] = 0
    result['time'] = max(result['cputime'], result['ramtime'])
    return result
def mergeLoops(linfo1, linfo2):
    """Merge info from child loop linfo2 into parent loop linfo1, in place.

    Flop counts, scalar read/write counts and array access counts of the
    child are scaled by the child's iteration count (numIters of its
    'range') and accumulated into linfo1. linfo2 itself is also mutated
    (its counts are scaled in place).

    Raises Exception if the child has nested loops of its own, or if an
    array access merge hits the unsupported copies-mismatch shape.
    """

    def mergeAccess(array1, array2):
        """Fold array2's access counts into array1's and return the result."""
        access1 = array1['access']
        access2 = array2['access']
        # HACK: if there's a numCopies mismatch, try to recognize if there's an implicit
        # knowledge of the number of elements in the array
        if array1['copies'] == 1 and array2['copies'] != 1:
            maxIndex = max(map(lambda x: x[0], access1.keys()))
            sys.stderr.write(
                'Warning: During merge of %s: assuming %s equals %s\n' %
                (array2['name'], array2['copies'], maxIndex))
            # list(...) so the comparison behaves the same whether keys()
            # returns a list or a view object
            if list(access2.keys()) != [(0, )]:
                raise Exception('unsupported access merge')
            for idx in range(1, maxIndex + 1):
                if (idx, ) in access1:
                    access1[(idx, )]['reads'] += access2[(0, )]['reads']
                    access1[(idx, )]['writes'] += access2[(0, )]['writes']
                else:
                    # copy per index: storing the same dict under several
                    # indices would make a later '+=' on one of them
                    # change all of them
                    access1[(idx, )] = dict(access2[(0, )])
        else:
            for index in access2:
                if index in access1:
                    access1[index]['reads'] += access2[index]['reads']
                    access1[index]['writes'] += access2[index]['writes']
                else:
                    access1[index] = access2[index]
        return access1

    iters = numIters(linfo2['range'])
    if linfo2['loops']:
        raise Exception('child loop cannot have nested loops of its own')

    # process flops
    for floptype in ['adds', 'multiplies', 'divides', 'specials']:
        linfo1['flops'][floptype] = linfo1['flops'][floptype] + \
            iters * linfo2['flops'][floptype]

    # process scalars
    for scalar2 in linfo2['scalars']:
        scalar2['reads'] *= iters
        scalar2['writes'] *= iters
        scalar1 = getMember(linfo1['scalars'], scalar2['name'])
        if scalar1:
            scalar1['reads'] += scalar2['reads']
            scalar1['writes'] += scalar2['writes']
        else:
            linfo1['scalars'].append(scalar2)

    # process spatial arrays and state arrays
    for category in ['arrays', 'stateArrays']:
        for array2 in linfo2[category]:
            # species arrays / relative state arrays scale their number of
            # copies; every other array scales its access counts instead
            if (category == 'arrays' and isSpeciesArray(array2['name'])) or \
               (category == 'stateArrays' and array2['arraytype'] == 'relStateArray'):
                array2['copies'] = iters * array2['copies']
            else:
                for access in array2['access'].values():
                    access['reads'] *= iters
                    access['writes'] *= iters
            array1 = getMember(linfo1[category], array2['name'])
            if array1:
                array1['access'] = mergeAccess(array1, array2)
            else:
                linfo1[category].append(array2)
def flattenLoopNest(linfo):
    """Collapse a recognised loop nest into a single innermost-loop record.

    Three nest shapes are recognised from the loop ranges:
      type 0 -- three nested spatial loops
      type 1 -- four nested loops, the inner three spatial
      type 2 -- four nested loops, the outer three spatial

    On success the innermost spatial loop's dict is returned (mutated in
    place): its 'loopvar'/'range'/'stride' keys are replaced by
    'loopvars'/'ranges'/'strides' lists ordered innermost first, 'linenum'
    is overwritten with the enclosing loop's, and 'sweeps' is set.
    Returns None when the nest matches none of the shapes (a warning is
    written to stderr if options.flag_warn is set).
    """

    def getRanges(linfo):
        """Collect [loopvars, ranges, strides] following the first child at each level."""
        result = [[linfo['loopvar']], [linfo['range']], [linfo['stride']]]
        if len(linfo['loops']) > 0:
            temp = getRanges(linfo['loops'][0])
            result[0].extend(temp[0])
            result[1].extend(temp[1])
            result[2].extend(temp[2])
        return result

    def isSpatial(x):
        # raw strings: '\(' and '\d' are regex escapes, not string escapes
        return re.search(r'd?lo\(\d\)', x[0]) and \
               re.search(r'd?hi\(\d\)', x[1])

    # triply nested spatial loop
    def isType0(ranges):
        return len(ranges) == 3 and all(map(isSpatial, ranges))

    # quadruply nested loop with spatial loops 2, 3, and 4
    def isType1(ranges):
        return len(ranges) == 4 and all(map(isSpatial, ranges[1:]))

    # quadruply nested loop with spatial loops 1, 2, and 3
    def isType2(ranges):
        return len(ranges) == 4 and all(map(isSpatial, ranges[:-1]))

    # merge info from child loop linfo2 into parent loop linfo1
    def mergeLoops(linfo1, linfo2):

        def mergeAccess(array1, array2):
            access1 = array1['access']
            access2 = array2['access']
            # HACK: if there's a numCopies mismatch, try to recognize if there's an implicit
            # knowledge of the number of elements in the array
            if array1['copies'] == 1 and array2['copies'] != 1:
                maxIndex = max(map(lambda x: x[0], access1.keys()))
                sys.stderr.write(
                    'Warning: During merge of %s: assuming %s equals %s\n' %
                    (array2['name'], array2['copies'], maxIndex))
                # list(...) so the comparison behaves the same whether
                # keys() returns a list or a view object
                if list(access2.keys()) != [(0, )]:
                    raise Exception('unsupported access merge')
                for idx in range(1, maxIndex + 1):
                    if (idx, ) in access1:
                        access1[(idx, )]['reads'] += access2[(0, )]['reads']
                        access1[(idx, )]['writes'] += access2[(0, )]['writes']
                    else:
                        # copy per index: storing the same dict under
                        # several indices would make a later '+=' on one
                        # of them change all of them
                        access1[(idx, )] = dict(access2[(0, )])
            else:
                for index in access2:
                    if index in access1:
                        access1[index]['reads'] += access2[index]['reads']
                        access1[index]['writes'] += access2[index]['writes']
                    else:
                        access1[index] = access2[index]
            return access1

        iters = numIters(linfo2['range'])
        if linfo2['loops']:
            raise Exception('child loop cannot have nested loops of its own')

        # process flops
        for floptype in ['adds', 'multiplies', 'divides', 'specials']:
            linfo1['flops'][floptype] = linfo1['flops'][floptype] + \
                iters * linfo2['flops'][floptype]

        # process scalars
        for scalar2 in linfo2['scalars']:
            scalar2['reads'] *= iters
            scalar2['writes'] *= iters
            scalar1 = getMember(linfo1['scalars'], scalar2['name'])
            if scalar1:
                scalar1['reads'] += scalar2['reads']
                scalar1['writes'] += scalar2['writes']
            else:
                linfo1['scalars'].append(scalar2)

        # process spatial arrays and state arrays
        for category in ['arrays', 'stateArrays']:
            for array2 in linfo2[category]:
                if (category == 'arrays' and isSpeciesArray(array2['name'])) or \
                   (category == 'stateArrays' and array2['arraytype'] == 'relStateArray'):
                    array2['copies'] = iters * array2['copies']
                else:
                    for access in array2['access'].values():
                        access['reads'] *= iters
                        access['writes'] *= iters
                array1 = getMember(linfo1[category], array2['name'])
                if array1:
                    array1['access'] = mergeAccess(array1, array2)
                else:
                    linfo1[category].append(array2)

    [loopvars, ranges, strides] = getRanges(linfo)
    if not (isType0(ranges) or isType1(ranges) or isType2(ranges)):
        if options.flag_warn:
            sys.stderr.write("Warning: ignoring loop nest: %s\n" % str(ranges))
        return

    # HACK: figure out loop type based on ranges
    sweeps = 1
    if isType1(ranges):
        # species sweep in outermost loop
        sweeps = numIters(linfo['range'])
        linfo = linfo['loops'][0]
        loopvars = loopvars[1:]
        ranges = ranges[1:]
        strides = strides[1:]
    elif isType2(ranges):
        # species sweep(s) inside third spatial loop level
        loopvars = loopvars[0:-1]
        ranges = ranges[0:-1]
        strides = strides[0:-1]
        linfo3 = linfo['loops'][0]['loops'][0]
        for linfo4 in linfo3['loops']:
            mergeLoops(linfo3, linfo4)
        del linfo3['loops']

    # use data in innermost loop
    linfo['loops'][0]['loops'][0]['linenum'] = linfo['linenum']
    linfo = linfo['loops'][0]['loops'][0]
    del linfo['loopvar']
    del linfo['range']
    del linfo['stride']

    # getRanges lists loops outermost first; store them innermost first
    loopvars.reverse()
    ranges.reverse()
    strides.reverse()
    linfo['loopvars'] = loopvars
    linfo['ranges'] = ranges
    linfo['strides'] = strides
    linfo['sweeps'] = sweeps
    return linfo
def writeDetails(f):
    """Write the 'DETAILED READ-ONLY ARRAY INFO' section to f.

    For every function in self.info (sorted by name) and every loop in it,
    this emits a two-line header, one '|'-separated row per read-only array
    (sorted by array name) and a 'Total' row of sums.

    f -- object with a write(str) method.

    NOTE(review): self, analyze, numIters, diff, doRefSubs, getRef and
    getRefs are not defined in this function; they must come from an
    enclosing scope that is not visible here.
    NOTE(review): cells beginning with '=' look like spreadsheet formulas
    and '#$CLABEL{..}' / '#$ELABEL{..}' look like label tags resolved by a
    later stage -- confirm against the consumer of this output.
    """
    f.write('DETAILED READ-ONLY ARRAY INFO\n\n')
    for fname in sorted(self.info.keys()):
        f.write('Function:|%s\n' % (fname))
        function = self.info[fname]
        for loop in function['loops']:
            # print header
            # loopid ('<function>.<line>') prefixes the labels of the Total row below
            loopid = '%s.%d' % (fname, loop['linenum'])
            f.write('Loop line num:|%d\n' % (loop['linenum']))
            f.write('|||Iteration Space|||Block Iteration Space|||||Block Access Space|||||Bandwidth|||' + \
                    'Working Set|||Reuse WS|||WS (all reads)|||WS (reuse only)|||GBytes transferred/sweep\n')
            f.write('||Name|X|Y|Z|X|Y|Z|X (cache line)|Num Blocks|' + \
                    'X#$CLABEL{accessx}|Y#$CLABEL{accessy}|Z#$CLABEL{accessz}|X (cache line)#$CLABEL{accessxcl}|' + \
                    'Copies#$CLABEL{aBWCopies}|Planes#$CLABEL{aBWPlanes}|' + \
                    'Pencils#$CLABEL{aBWPencils}|Cells#$CLABEL{aBWCells}|' + \
                    'Planes#$CLABEL{aWSPlanes}|Pencils#$CLABEL{aWSPencils}|Cells#$CLABEL{aWSCells}|' + \
                    'Planes#$CLABEL{aWSReusePlanes}|Pencils#$CLABEL{aWSReusePencils}|Cells#$CLABEL{aWSReuseCells}|' + \
                    'WS/plane/core (kB)#$CLABEL{aWSPlane}|' + \
                    'WS/pencil/core (kB)#$CLABEL{aWSPencil}|' + \
                    'WS/cell/core (kB)#$CLABEL{aWSCell}|' + \
                    'WS/plane/core (kB)#$CLABEL{aWSReusePlane}|' + \
                    'WS/pencil/core (kB)#$CLABEL{aWSReusePencil}|' + \
                    'WS/cell/core (kB)#$CLABEL{aWSReuseCell}|' + \
                    'Reuse between planes#$CLABEL{aBlockGbytes}|' + \
                    'Reuse between pencils#$CLABEL{aPlaneGbytes}|' + \
                    'Reuse within pencils#$CLABEL{aPencilGbytes}|' + \
                    'No reuse within pencils#$CLABEL{aCellGbytes}|' + \
                    '\n')
            readArrays = sorted(analyze.getR(loop['arrays']), key=lambda x: x['name'])
            for array in readArrays:
                # name
                f.write('||%s' % (array['name']))

                # iteration space
                f.write('|=%s' % doRefSubs(numIters(loop['ranges'][0])))
                f.write('|=%s' % doRefSubs(numIters(loop['ranges'][1])))
                f.write('|=%s' % doRefSubs(numIters(loop['ranges'][2])))

                # block iteration space (with also X rounded up to nearest cache line) and num blocks
                f.write('|=%s' % getRefs('X Block Size'))
                f.write('|=%s' % getRefs('Y Block Size'))
                f.write('|=%s' % getRefs('Z Block Size'))
                f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('blockx', 'Cache Line Size', 'Word Size', \
                                                                    'Cache Line Size', 'Word Size'))
                f.write('|=%s/%s*%s/%s*%s/%s' % getRefs('iterx', 'blockx', 'itery', 'blocky', 'iterz', 'blockz'))

                # block access space (with also X rounded up to nearest cache line)
                f.write('|=%s+%d' % (getRef('blockx'), diff(array['ghost'][0])[0]))
                f.write('|=%s+%d' % (getRef('blocky'), diff(array['ghost'][1])[0]))
                f.write('|=%s+%d' % (getRef('blockz'), diff(array['ghost'][2])[0]))
                f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('accessx', 'Cache Line Size', 'Word Size', \
                                                                    'Cache Line Size', 'Word Size'))

                # bandwidth and working set figures from access pattern analysis
                # ('*' and '%' associate left to right, so the template is
                # repeated 10 times first and then filled with the 10 values)
                f.write('|=%s'*10 % tuple(map(doRefSubs, [array['copies'], array['BW']['numPlanes'], \
                    array['BW']['numPencils'], array['BW']['numCells'], array['WS']['numPlanes'], \
                    array['WS']['numPencils'], array['WS']['numCells'], array['WS']['numReusePlanes'], \
                    array['WS']['numReusePencils'], array['WS']['numReuseCells']])))

                # working set size formulas (kB)
                if array['stenciltype'] == 'faces':
                    f.write('|=%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))/2^10' % \
                            getRefs('Word Size', 'aBWCopies', \
                                    'aWSPlanes', 'blockxcl', 'blocky', \
                                    'blockxcl', 'accessy', \
                                    'accessxcl', 'blocky', \
                                    'blockxcl', 'blocky'))
                else:
                    f.write('|=%s*%s*%s*%s*%s/2^10' % \
                            getRefs('Word Size', 'aBWCopies', 'aWSPlanes', 'accessxcl', 'accessy'))
                if array['stenciltype'] == 'faces':
                    f.write('|=%s*%s*((%s-1)*(%s)+1*(%s))/2^10' % \
                            getRefs('Word Size', 'aBWCopies', 'aWSPencils', 'blockxcl', 'accessxcl'))
                else:
                    f.write('|=%s*%s*%s*%s/2^10' % getRefs('Word Size', 'aBWCopies', 'aWSPencils', 'accessxcl'))
                f.write('|=%s*%s*%s/2^10' % getRefs('Word Size', 'aBWCopies', 'aWSCells'))

                # working set size formulas (reuse only) (kB)
                if array['stenciltype'] == 'faces':
                    f.write('|=%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))/2^10' % \
                            getRefs('Word Size', 'aBWCopies', \
                                    'aWSReusePlanes', 'blockxcl', 'blocky', \
                                    'blockxcl', 'accessy', \
                                    'accessxcl', 'blocky', \
                                    'blockxcl', 'blocky'))
                else:
                    f.write('|=%s*%s*%s*%s*%s/2^10' % \
                            getRefs('Word Size', 'aBWCopies', 'aWSReusePlanes', 'accessxcl', 'accessy'))
                if array['stenciltype'] == 'faces':
                    f.write('|=%s*%s*((%s-1)*(%s)+1*(%s))/2^10' % \
                            getRefs('Word Size', 'aBWCopies', 'aWSReusePencils', 'blockxcl', 'accessxcl'))
                else:
                    f.write('|=%s*%s*%s*%s/2^10' % getRefs('Word Size', 'aBWCopies', 'aWSReusePencils', 'accessxcl'))
                f.write(
                    '|=%s*%s*%s/2^10' % getRefs('Word Size', 'aBWCopies', 'aWSReuseCells'))

                # memory traffic formulas (GBytes per sweep)
                if array['stenciltype'] == 'faces':
                    f.write('|=%s*%s*%s*(%s*%s*%s+%s*%s*%s+%s*%s*%s-2*%s*%s*%s)/2^30' % \
                            getRefs('numBlocks', 'aBWCopies', 'R cost', 'accessxcl', 'blocky', 'blockz', 'blockxcl', \
                                    'accessy', 'blockz', 'blockxcl', 'blocky', 'accessz', 'blockxcl', 'blocky', 'blockz'))
                    f.write('|=%s*%s*%s*((%s-1)*(%s*%s)+1*(%s*%s+%s*%s-%s*%s))*%s/2^30' % \
                            getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                    'aBWPlanes', 'blockxcl', 'blocky', \
                                    'accessxcl', 'blocky', \
                                    'blockxcl', 'accessy', \
                                    'blockxcl', 'blocky', \
                                    'blockz'))
                    f.write('|=%s*%s*%s*((%s-1)*%s+1*%s)*%s*%s/2^30' % \
                            getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                    'aBWPencils', 'blockxcl', \
                                    'accessxcl', \
                                    'blocky', 'blockz'))
                else:
                    f.write('|=%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'R cost', \
                                                                 'accessxcl', 'accessy', 'accessz'))
                    f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWPlanes', 'R cost', \
                                                                    'accessxcl', 'accessy', 'blockz'))
                    f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWPencils', 'R cost', \
                                                                    'accessxcl', 'blocky', 'blockz'))
                # the no-reuse ("cell") traffic column is written for both stencil types
                f.write('|=%s*%s*%s*%s*%s*%s*%s/2^30' % getRefs('numBlocks', 'aBWCopies', 'aBWCells', 'R cost', \
                                                                'blockx', 'blocky', 'blockz'))
                f.write('\n')

            # prefix each label with this loop's id ('<function>.<line>.<label>')
            def addFname(*args):
                return tuple(map(lambda x: '%s.%s' % (loopid, x), args))

            # Two-stage formatting: the first '%' fills the ten label names
            # and turns each '%%s' into '%s'; the second '%' (in the write
            # below) fills the ten (start, end) range references.
            temp = '|=sum(%%s:%%s)#$ELABEL{%s}' * 10 % \
                   addFname('WSPlane', 'WSPencil', 'WSCell', \
                            'WSReusePlane', 'WSReusePencil', 'WSReuseCell', \
                            'blockGbytes', 'planeGbytes', 'pencilGbytes', 'cellGbytes')
            # max(len(readArrays), 1) keeps the range at least one row tall
            # even when the loop has no read-only arrays
            f.write('||Total' + '|' * 22 + temp % (getRefs(
                (-max(len(readArrays), 1), 0), (-1, 0)) * 10))
            f.write('\n')
        # NOTE(review): the nesting level of this blank-line write was
        # reconstructed from whitespace-collapsed source -- confirm whether
        # it belongs per function (as here) or per loop.
        f.write('\n')
def writePWSummary(f):
    """Write the 'WOODWARD LOOP ANALYSIS' summary table to f.

    Emits two header rows, one '|'-separated row per loop of every function
    in self.info (functions sorted by name) and a final 'Total/Max' row.

    f -- object with a write(str) method.

    NOTE(review): self, numIters, doRefSubs, getRef and getRefs are not
    defined in this function; they must come from an enclosing scope that
    is not visible here.
    NOTE(review): cells beginning with '=' look like spreadsheet formulas
    and '#$CLABEL{..}' looks like a column-label tag resolved by a later
    stage -- confirm against the consumer of this output.
    """
    # write loop summary header with column labels
    f.write('WOODWARD LOOP ANALYSIS\n')
    f.write(
        '|||Iteration Space|||Block Iteration Space|||||Flops/cell/sweep||||Registers||Num Blocks||||Working Set|Bandwidth||||Computation (Gflops)|||Estimated execution times (s)\n'
    )
    f.write('function|line|sweeps#$CLABEL{sweeps}|' + \
            'X#$CLABEL{iterx}|Y#$CLABEL{itery}|Z#$CLABEL{iterz}|' + \
            'X#$CLABEL{blockx}|Y#$CLABEL{blocky}|Z#$CLABEL{blockz}|' + \
            'X (cache line)#$CLABEL{blockxcl}|Num Blocks#$CLABEL{numBlocks}|' + \
            'add#$CLABEL{add}|mul#$CLABEL{mul}|div#$CLABEL{div}|special#$CLABEL{spec}|GP Regs|FP Regs|' + \
            'Resident#$CLABEL{ResidentBlocks}|' + \
            'R#$CLABEL{RBlocks}|' + \
            'W#$CLABEL{WBlocks}|' + \
            'RW#$CLABEL{RWBlocks}|' + \
            'kB#$CLABEL{WSBlock}|' + \
            'R (GB)#$CLABEL{BWBlockR}|' + \
            'W (GB)#$CLABEL{BWBlockW}|' + \
            'RW (GB)#$CLABEL{BWBlockRW}|' + \
            'Total (GB)#$CLABEL{BWBlock}|' + \
            'Gflops performed#$CLABEL{gflopsPW}|Weighted Gflops#$CLABEL{wGflopsPW}|B/F ratio|' + \
            'time (CPU)#$CLABEL{timeCPUPW}|time (DRAM)#$CLABEL{timeRAMPW}|time (CPU and DRAM)|\n')

    # write summary info
    for fname in sorted(self.info.keys()):
        function = self.info[fname]
        for loop in function['loops']:
            flops = loop['flops']
            # NOTE(review): loopid is assigned but not used anywhere in this function
            loopid = '%s.%d' % (fname, loop['linenum'])
            f.write(
                '%s|%d|=%s' % (fname, loop['linenum'], doRefSubs(loop['sweeps'])))

            # iteration space
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][0])))
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][1])))
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][2])))

            # expanded block iteration space with overlapped ghost regions
            f.write('|=%s+(%s-%s)' % getRefs('X Block Size', 'iterx', 'X Problem Size'))
            f.write('|=%s+(%s-%s)' % getRefs('Y Block Size', 'itery', 'Y Problem Size'))
            f.write('|=%s+(%s-%s)' % getRefs('Z Block Size', 'iterz', 'Z Problem Size'))

            # block X rounded up to nearest cache line and num blocks
            f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('blockx', 'Cache Line Size', 'Word Size', \
                                                                'Cache Line Size', 'Word Size'))
            f.write('|=%s/%s*%s/%s*%s/%s' % getRefs('X Problem Size', 'X Block Size', \
                                                    'Y Problem Size', 'Y Block Size', \
                                                    'Z Problem Size', 'Z Block Size'))

            # flops
            f.write('|=%s|=%s|=%s|=%s' % tuple(map(lambda x: doRefSubs(flops[x]), \
                                                   ['adds', 'multiplies', 'divides', 'specials'])))

            # registers
            f.write('|=%s|=%s' % (doRefSubs(loop['registers']['ints'] + loop['registers']['ptrs']), \
                                  doRefSubs(loop['registers']['floats'])))

            # number of blocks resident, read, and written
            f.write('|=%s'*4 % tuple(map(doRefSubs, [loop['WS']['numBlocks'], loop['BW']['numBlocks']['R'], \
                                                     loop['BW']['numBlocks']['W'], loop['BW']['numBlocks']['RW']])))

            # working set and bandwidth estimates
            # (each bandwidth cell shows "N/A" unless the working set is
            # below the cache size)
            f.write('|=%s*(%s)/2^10' % (getRef('Word Size'), doRefSubs(loop['WS']['sizeBlocks'])))
            f.write('|=IF(%s<%s, %s*%s*(%s)/2^30, "N/A")' % \
                    (getRefs('WSBlock', '$/thread group (kB)', 'R cost', 'numBlocks') + \
                     (doRefSubs(loop['BW']['sizeBlocks']['R']),)))
            f.write('|=IF(%s<%s, %s*%s*(%s)/2^30, "N/A")' % \
                    (getRefs('WSBlock', '$/thread group (kB)', 'W cost', 'numBlocks') + \
                     (doRefSubs(loop['BW']['sizeBlocks']['W']),)))
            f.write('|=IF(%s<%s, %s*%s*(%s)/2^30, "N/A")' % \
                    (getRefs('WSBlock', '$/thread group (kB)', 'RW cost', 'numBlocks') + \
                     (doRefSubs(loop['BW']['sizeBlocks']['RW']),)))
            f.write('|=%s+%s+%s' % getRefs('BWBlockR', 'BWBlockW', 'BWBlockRW'))

            # Gflops and arithmetic intensity
            f.write('|=%s*(%s+%s+%s+%s)*%s*%s*%s*%s/1e9' % getRefs('sweeps', 'add', 'mul', 'div', 'spec', \
                                                                   'blockx', 'blocky', 'blockz', 'numBlocks'))
            f.write('|=%s*(%s+%s+%s*%s+%s*%s)*%s*%s*%s*%s/1e9' % \
                    getRefs('sweeps', 'add', 'mul', 'Division Cost', 'div', 'Special Cost', 'spec', \
                            'blockx', 'blocky', 'blockz', 'numBlocks'))
            f.write('|=(%s*2^30)/(%s*10^9)' % getRefs('BWBlock', 'wGflopsPW'))

            # estimated execution times
            f.write('|=%s/(%s*%s)' % getRefs('wGflopsPW', 'Gflop/s/thread', 'Threads'))
            f.write('|=%s/(%s*%s)' % getRefs('BWBlock', 'GB/s/thread', 'Threads'))
            f.write('|=max(%s:%s)' % getRefs('timeCPUPW', 'timeRAMPW'))
            f.write('\n')

    # totals
    # numLoops = number of loop rows written above; the range references
    # below span (-numLoops, 0) .. (-1, 0)
    numLoops = sum([len(func['loops']) for func in self.info.values()])
    f.write('Total/Max' + '|' * 20)
    f.write('|=max(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)))
    # '%' and '*' associate left to right: the cell is formatted once and
    # the resulting text is then repeated
    f.write('|=sum(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)) * 6)
    f.write('|=(%s*2^30)/(%s*10^9)' % getRefs('BWBlock', 'wGflopsPW'))
    f.write('|=sum(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)) * 3)
    f.write('\n\n')
def writeSummary(f):
    """Write the 'LOOP ANALYSIS' summary table to f.

    Emits two header rows, one '|'-separated row per loop of every function
    in self.info (functions sorted by name) and a final 'Total/Max' row.
    Several cells reference '<function>.<line>.<label>' names built from
    loopid.

    f -- object with a write(str) method.

    NOTE(review): self, numIters, doRefSubs and getRefs are not defined in
    this function; they must come from an enclosing scope that is not
    visible here.
    NOTE(review): cells beginning with '=' look like spreadsheet formulas
    and '#$CLABEL{..}' looks like a column-label tag resolved by a later
    stage -- confirm against the consumer of this output.
    """
    # write loop summary header with column labels
    f.write('LOOP ANALYSIS\n')
    f.write(
        '|||Iteration Space|||Block Iteration Space|||||Flops/cell/sweep||||Registers||RW/W Arrays||RW/W Working Set|||Working set (naive)|||Working set (streaming writes)|||Working set (reuse only)|||Working set (actual)|||Memory Traffic (GBytes)|||||Computation (Gflops)|||Estimated execution times (s)\n'
    )
    f.write('function|line|sweeps#$CLABEL{sweeps}|' + \
            'X#$CLABEL{iterx}|Y#$CLABEL{itery}|Z#$CLABEL{iterz}|' + \
            'X#$CLABEL{blockx}|Y#$CLABEL{blocky}|Z#$CLABEL{blockz}|' + \
            'X (cache line)#$CLABEL{blockxcl}|Num Blocks#$CLABEL{numBlocks}|' + \
            'add#$CLABEL{add}|mul#$CLABEL{mul}|div#$CLABEL{div}|special#$CLABEL{spec}|GP Regs|FP Regs|' + \
            'RW Arrays#$CLABEL{RWArrays}|W Arrays#$CLABEL{WArrays}|' + \
            'RW/W WS Planes#$CLABEL{WSWPlanes}|RW/W WS Pencils#$CLABEL{WSWPencils}|RW/W WS Cells#$CLABEL{WSWCells}|' + \
            'WS/plane/core (kB)#$CLABEL{WSPlane}|' + \
            'WS/pencil/core (kB)#$CLABEL{WSPencil}|' + \
            'WS/cell/core (kB)#$CLABEL{WSCell}|' + \
            'WS/plane/core (kB)#$CLABEL{WSStreamPlane}|' + \
            'WS/pencil/core (kB)#$CLABEL{WSStreamPencil}|' + \
            'WS/cell/core (kB)#$CLABEL{WSStreamCell}|' + \
            'WS/plane/core (kB)#$CLABEL{WSReusePlane}|' + \
            'WS/pencil/core (kB)#$CLABEL{WSReusePencil}|' + \
            'WS/cell/core (kB)#$CLABEL{WSReuseCell}|' + \
            'WS/plane/core (kB)#$CLABEL{WSActualPlane}|' + \
            'WS/pencil/core (kB)#$CLABEL{WSActualPencil}|' + \
            'WS/cell/core (kB)#$CLABEL{WSActualCell}|' + \
            'Reuse between planes#$CLABEL{blockGbytes}|' + \
            'Reuse between pencils#$CLABEL{planeGbytes}|' + \
            'Reuse within pencils#$CLABEL{pencilGbytes}|' + \
            'No reuse within pencils#$CLABEL{cellGbytes}|' + \
            'Actual#$CLABEL{aGbytes}|' + \
            'Gflops performed#$CLABEL{gflops}|Weighted Gflops#$CLABEL{wGflops}|B/F ratio|' + \
            'time (CPU)#$CLABEL{timeCPU}|time (DRAM)#$CLABEL{timeRAM}|time (CPU and DRAM)|\n')

    # write summary info
    for fname in sorted(self.info.keys()):
        function = self.info[fname]
        for loop in function['loops']:
            flops = loop['flops']
            # loopid ('<function>.<line>') prefixes the per-loop labels referenced below
            loopid = '%s.%d' % (fname, loop['linenum'])
            f.write(
                '%s|%d|=%s' % (fname, loop['linenum'], doRefSubs(loop['sweeps'])))

            # iteration space
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][0])))
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][1])))
            f.write('|=%s' % doRefSubs(numIters(loop['ranges'][2])))

            # block iteration space
            f.write('|=%s' % getRefs('X Block Size'))
            f.write('|=%s' % getRefs('Y Block Size'))
            f.write('|=%s' % getRefs('Z Block Size'))

            # block X rounded up to nearest cache line and num blocks
            f.write('|=ceiling(%s/(%s/%s),1)*(%s/%s)' % getRefs('blockx', 'Cache Line Size', 'Word Size', \
                                                                'Cache Line Size', 'Word Size'))
            f.write('|=%s/%s*%s/%s*%s/%s' % getRefs('iterx', 'blockx', 'itery', 'blocky', 'iterz', 'blockz'))

            # flops
            f.write('|=%s|=%s|=%s|=%s' % tuple(map(lambda x: doRefSubs(flops[x]), \
                                                   ['adds', 'multiplies', 'divides', 'specials'])))

            # registers
            f.write('|=%s|=%s' % (doRefSubs(loop['registers']['ints'] + loop['registers']['ptrs']), \
                                  doRefSubs(loop['registers']['floats'])))

            # read/write and write arrays
            numArrays = loop['BW']['numArrays']
            f.write('|=%s|=%s' % (doRefSubs(
                numArrays['RW']), doRefSubs(numArrays['W'])))

            # working set planes, pencils, and cells
            numPlanes = loop['WS']['numPlanes']
            numPencils = loop['WS']['numPencils']
            numCells = loop['WS']['numCells']
            f.write('|=%s' % doRefSubs(numPlanes['RW'] + numPlanes['W']))
            f.write('|=%s' % doRefSubs(numPencils['RW'] + numPencils['W']))
            f.write('|=%s' % doRefSubs(numCells['RW'] + numCells['W']))

            # working set sizes (kB)
            # naive: per-loop read-array total plus the RW/W array contribution
            f.write('|=%s+%s*%s*%s*%s/2^10' % getRefs('%s.WSPlane' % loopid, 'Word Size', 'WSWPlanes', 'blockxcl', 'blocky'))
            f.write('|=%s+%s*%s*%s/2^10' % getRefs('%s.WSPencil' % loopid, 'Word Size', 'WSWPencils', 'blockxcl'))
            f.write(
                '|=%s+%s*%s/2^10' % getRefs('%s.WSCell' % loopid, 'Word Size', 'WSWCells'))
            # streaming writes: the per-loop read-array totals alone
            f.write('|=%s' % getRefs('%s.WSPlane' % loopid))
            f.write('|=%s' % getRefs('%s.WSPencil' % loopid))
            f.write('|=%s' % getRefs('%s.WSCell' % loopid))
            # reuse only
            f.write('|=%s' % getRefs('%s.WSReusePlane' % loopid))
            f.write('|=%s' % getRefs('%s.WSReusePencil' % loopid))
            f.write('|=%s' % getRefs('%s.WSReuseCell' % loopid))
            # actual: when two adjacent traffic columns are equal, fall back
            # to the next smaller working set; otherwise pick the scenario
            # selected by 'NTA Hints' / 'Streaming Writes'
            f.write('|=IF(%s=%s,%s,IF(%s, %s, IF(%s, %s, %s)))' % \
                    getRefs('blockGbytes', 'planeGbytes', 'WSActualPencil', \
                            'NTA Hints', 'WSReusePlane', 'Streaming Writes', 'WSStreamPlane', 'WSPlane'))
            f.write('|=IF(%s=%s,%s,IF(%s, %s, IF(%s, %s, %s)))' % \
                    getRefs('planeGbytes', 'pencilGbytes', 'WSActualCell', \
                            'NTA Hints', 'WSReusePencil', 'Streaming Writes', 'WSStreamPencil', 'WSPencil'))
            f.write('|=IF(%s, %s, IF(%s, %s, %s))' % \
                    getRefs('NTA Hints', 'WSReuseCell', 'Streaming Writes', 'WSStreamCell', 'WSCell'))

            # GBytes transferred (use results from detailed read-only array analysis)
            f.write('|=%s*(%s+(%s*%s+%s*%s)*%s*%s*%s*%s/2^30)' % \
                    getRefs('sweeps', '%s.blockGbytes' % loopid, \
                            'RWArrays', 'RW cost', 'WArrays', 'W cost', 'numBlocks', 'blockxcl', 'blocky', 'blockz'))
            f.write('|=%s*(%s+(%s*%s+%s*%s)*%s*%s*%s*%s/2^30)' % \
                    getRefs('sweeps', '%s.planeGbytes' % loopid, \
                            'RWArrays', 'RW cost', 'WArrays', 'W cost', 'numBlocks', 'blockxcl', 'blocky', 'blockz'))
            f.write('|=%s*(%s+(%s*%s+%s*%s)*%s*%s*%s*%s/2^30)' % \
                    getRefs('sweeps', '%s.pencilGbytes' % loopid, \
                            'RWArrays', 'RW cost', 'WArrays', 'W cost', 'numBlocks', 'blockxcl', 'blocky', 'blockz'))
            f.write('|=%s*(%s+(%s*%s+%s*%s)*%s*%s*%s*%s/2^30)' % \
                    getRefs('sweeps', '%s.cellGbytes' % loopid, \
                            'RWArrays', 'RW cost', 'WArrays', 'W cost', 'numBlocks', 'blockxcl', 'blocky', 'blockz'))
            # actual traffic: minimum over the reuse levels from the first
            # one whose actual working set is below the cache size
            f.write('|=IF(%s<%s,MIN(%s:%s),IF(%s<%s,MIN(%s:%s),IF(%s<%s,MIN(%s:%s),%s)))' % \
                    getRefs('WSActualPlane', '$/thread group (kB)', 'blockGbytes', 'cellGbytes', \
                            'WSActualPencil', '$/thread group (kB)', 'planeGbytes', 'cellGbytes', \
                            'WSActualCell', '$/thread group (kB)', 'pencilGbytes', 'cellGbytes', \
                            'cellGbytes'))

            # Gflops and arithmetic intensity
            f.write('|=%s*(%s+%s+%s+%s)*%s*%s*%s/1e9' % getRefs('sweeps', 'add', 'mul', 'div', 'spec', \
                                                                'iterx', 'itery', 'iterz'))
            f.write('|=%s*(%s+%s+%s*%s+%s*%s)*%s*%s*%s/1e9' % \
                    getRefs('sweeps', 'add', 'mul', 'Division Cost', 'div', 'Special Cost', 'spec', \
                            'iterx', 'itery', 'iterz'))
            f.write('|=(%s*2^30)/(%s*10^9)' % getRefs('aGbytes', 'wGflops'))

            # estimated execution times
            f.write('|=%s/(%s*%s)' % getRefs('wGflops', 'Gflop/s/thread', 'Threads'))
            f.write('|=%s/(%s*%s)' % getRefs('aGbytes', 'GB/s/thread', 'Threads'))
            f.write('|=max(%s:%s)' % getRefs('timeCPU', 'timeRAM'))
            f.write('\n')

    # totals
    # numLoops = number of loop rows written above; the range references
    # below span (-numLoops, 0) .. (-1, 0)
    numLoops = sum([len(func['loops']) for func in self.info.values()])
    f.write('Total/Max' + '|' * 30)
    # '%' and '*' associate left to right: the cell is formatted once and
    # the resulting text is then repeated
    f.write('|=max(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)) * 3)
    f.write('|=sum(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)) * 7)
    f.write('|=(%s*2^30)/(%s*10^9)' % getRefs('aGbytes', 'wGflops'))
    f.write('|=sum(%s:%s)' % getRefs((-numLoops, 0), (-1, 0)) * 3)
    f.write('\n\n')