opt = zopt elif cc != None: opt = opt + ' UCC=' + cc + ' UCCFLAGS="' + ccf + outf + '"' else: opt = opt + ' UCCFLAGS="' + outf + '"' # print "opt = '%s'" % opt cmnd = 'make %s%scase N=%d urout=%s %s' % (pre, blas, N, rout, opt) cmnds = 'cd %s/tune/blas/level1/%s ; %s ' % (ATLdir, ARCH, cmnd) print "cmnds = '%s'" % (cmnds) fo = os.popen(cmnds, 'r') lines = fo.readlines() err = fo.close() (IFKOdir, fko) = fkocmnd.GetFKOinfo() OUTdir = IFKOdir + '/blas/assembly' (ATLdir, ARCH) = fkocmnd.FindAtlas(IFKOdir) print ARCH print "ATLdir='%s', ARCH='%s'" % (ATLdir, ARCH) # [time,mflop] = l1cmnd.l1time(ATLdir, ARCH, 'd', 'dot', 80000, 'dot1_x1y1.c') # print "time=%f, mflop=%f" % (time,mflop) # # Defaults # N = 80000 pres = l1cmnd.GetDefaultPre() l1routs = l1cmnd.GetDefaultBlas()
def ifko0(l1bla, pre, N, M=None, lda=None):
    """Empirically tune the FKO flags for one BLAS kernel.

    The kernel source is ``<IFKOdir>/blas/<pre><l1bla>.b``; every candidate
    flag set is compiled to ``fkorout.s`` under the kernel's ATLAS tuning
    directory and timed.  The stages run in a fixed order, each one starting
    from the flags the previous stage kept:

        scalar baseline -> path reduction -> vectorization ->
        non-temporal writes -> prefetch distance -> prefetch type ->
        unroll -> over-speculation (only if ``isSV``) ->
        scalar expansion -> prefetch distance again

    Parameters:
        l1bla -- kernel name (e.g. as used in the ``.b`` file name).
        pre   -- precision/type prefix prepended to ``l1bla``.
        N, M, lda -- problem size arguments, forwarded unchanged to the
                 timer, the tester and every search routine.

    Module-level names read: ``URF``, ``SB``, ``skipOpt``, ``forceOpt``,
    ``opt``, ``optT``.  ``isSV`` is read and may be reset to 0 here.

    Returns:
        (res, KFLAGS, mf, tst, testlist, mflist) where ``res`` comes from
        ``fkocmnd.GetOptVals``, ``KFLAGS`` is the final flag string, ``mf``
        is the figure reported by the last search that ran, ``tst`` is the
        result of ``cmnd.test``, and ``testlist``/``mflist`` are parallel
        lists naming each stage that ran and the figure it reported.
    """
    global isSV

    (IFKOdir, fko) = fkocmnd.GetFKOinfo()
    (ATLdir, ARCH) = fkocmnd.FindAtlas(IFKOdir)
    rout = IFKOdir + '/blas/' + pre + l1bla + '.b'
    outf = ATLdir + kernels.GetBlasPath(l1bla) + '/fkorout.s'
    # Suffix that turns a bare flag string into a complete fko command line.
    io_args = " -o " + str(outf) + " " + rout

    #
    # Majedul: NewInfo returns the old info list (first 11 entries) followed
    # by [npath, red2onePath, vecMethod, vpathinfo, arrtypes].
    #
    newinfo = fkocmnd.NewInfo(fko, rout)
    info = [newinfo[i] for i in range(11)]
    [npath, red1path, vecm, vpath, _arrtypes] = [newinfo[i]
                                                 for i in range(11, 16)]
    ncache = info[0]
    vec = info[5]
    (fparrs, fpsets, _fpuses, _fpurs) = fkocmnd.GetFPInfo(newinfo)

    mflist = []
    testlist = []

    #
    # Report the standard flags with vectorization enabled.  These are only
    # printed; tuning starts from the scalar flags below.
    # NOTE(review): URF is the 5th positional argument here and in the scalar
    # call below, but the vector call further down passes SB 5th and URF 6th.
    # Confirm against GetOptStdFlags' signature that URF is not landing in
    # the SB slot.
    #
    print("\n Default Flag = " + fkocmnd.GetOptStdFlags(fko, rout, pre, 1, URF))

    #
    # Majedul: the default and vectorized cases are no longer the same, and
    # the vectorized code may be slower than the scalar code.  Time the best
    # standard scalar flags first and use them as the reference.
    #
    KFn = fkocmnd.GetOptStdFlags(fko, rout, pre, 0, URF) + io_args
    KF0 = KFn
    fkocmnd.callfko(fko, KF0)
    [_t0, mf0] = cmnd.time(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s",
                           "gcc", "-x assembler-with-cpp", opt=opt)
    mflist.append(mf0)
    testlist.append("default")
    print("\n Default Flag = " + KF0)

    #
    # Find the best path reduction option
    #
    if npath > 1:
        [mfs, KFs] = ifko_PathXform(ATLdir, ARCH, KFn, ncache, fko, rout, pre,
                                    l1bla, N, M, lda, npath, red1path)
        mflist.append(mfs)
        testlist.append("PathXform")
        if mfs > mf0:
            mf0 = mfs
            KF0 = KFs

    #
    # Find the best vector option with/without path reduction
    #
    if SB:
        KFv = fkocmnd.GetOptStdFlags(fko, rout, pre, 1, SB, URF)
    else:
        KFv = fkocmnd.GetOptStdFlags(fko, rout, pre, 1, 0, URF)
    print("\n Standad Flag for Vect = " + KFv)
    KFv = KFv + io_args

    if vec:
        if 'v' in skipOpt:
            # NOTE(review): isSV is left untouched on this path (and when
            # `vec` is false), so the over-speculation stage can still run
            # on scalar flags.  Confirm that is intended.
            print('\n SKIPPING VECTORIZATION')
        else:
            [mfv, KFv] = ifko_Vec(ATLdir, ARCH, KFv, ncache, fko, rout, pre,
                                  l1bla, N, M, lda, npath, vecm, vpath)
            mflist.append(mfv)
            testlist.append("vect")
            if mfv > mf0:
                mf0 = mfv
                KF0 = KFv
            elif ('sv' in forceOpt or 'vrc' in forceOpt
                  or 'vmmr' in forceOpt):
                # A forced option keeps vectorization even when it is slower.
                print('\n FORCING VECTORIZATION')
                mf0 = mfv
                KF0 = KFv
            else:
                # No vector variant was selected, so skip the SB stage too.
                isSV = 0

    #
    # Use the best variant so far as the reference for the later stages
    #
    KFLAGS = KF0
    mf = mf0
    print("\n FLAGS so far = %s" % fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS))

    #
    # Find if we want to use cache-through writes on any arrays
    #
    if 'wnt' in skipOpt:
        print('\n SKIPPING WNT')
    else:
        # Candidates are the arrays with a positive set count.
        wnt = [fparrs[i] for i in range(len(fpsets)) if fpsets[i] > 0]
        if len(wnt) > 0:
            [mf, KFLAGS, wnt] = ifko_writeNT(ATLdir, ARCH, KFLAGS, fko, rout,
                                             pre, l1bla, N, M, lda, wnt)
            mflist.append(mf)
            testlist.append("writeNT")

    #
    # Find best PFD for each pfarr
    #
    pfarrs = fparrs
    pfsets = fpsets
    for arr in pfarrs:
        [mf, KFLAGS] = FindPFD(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla,
                               N, M, lda, info, arr)
    mflist.append(mf)
    testlist.append("pfdist")
    KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)

    #
    # Find best pf type
    #
    [mf, KFLAGS] = ifko_pftype(ATLdir, ARCH, KFLAGS, ncache, fko, rout, pre,
                               l1bla, N, M, lda, info, pfarrs, pfsets)
    mflist.append(mf)
    testlist.append("pftype")
    print("\n FLAGS so far = %s" % fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS))

    #
    # Find best unroll
    #
    if URF:
        print('\n SKIPPING UNROLL TUNNING : FORCED TO %d' % URF)
    else:
        [mf, KFLAGS] = FindUR(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla,
                              N, M, lda, info)
        mflist.append(mf)
        testlist.append("unroll")

    #
    # Find best bet for over speculation
    # FIXME: find out the -U and pass it to the function
    # FIXME: can't apply Over Spec if there is a memory write inside the loop
    #
    if isSV:
        # Kernels whose name contains one of these substrings are excluded;
        # the first match, in this order, is the one reported.
        bet_excluded = ("irk1amax", "irk2amax", "irk3amax", "sin", "cos")
        hits = [key for key in bet_excluded if key in l1bla]
        if hits:
            print('\n SKIPPING STRONGER BET UNROLLING for %s'
                  % hits[0].upper())
        else:
            [mf, KFLAGS] = FindBET(ATLdir, ARCH, KFLAGS, fko, rout, pre,
                                   l1bla, N, M, lda)
            mflist.append(mf)
            testlist.append("OverSpec")

    #
    # Majedul: see if we can apply scalar expansion
    # (accumulator expansion + max/min expansion)
    #
    acc = fkocmnd.GetFPAccum(info)
    nacc = len(acc)
    if 're' in skipOpt:
        print('\n SKIPPING SCALAR EXPANSION')
    elif isSV:
        print('\n SKIPPING SCALAR EXPANSION: NOT SUPPORTED WITH SV')
    elif l1bla.find("iamax") != -1:
        print('\n SKIPPING SCALAR EXPANSION FOR IAMAX')
    elif nacc > 0 and nacc < 3:
        [mf, KFLAGS] = FindRE(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla,
                              N, M, lda, acc)
        mflist.append(mf)
        testlist.append("rdexp")

    #
    # Re-tune the prefetch distance now that the other transformations are
    # fixed.  It is recorded as a separate "pfd2" entry rather than replacing
    # "pfdist", so the per-stage improvement figures stay meaningful.
    #
    KFLAGS = fkocmnd.SetDefaultPFD(KFLAGS, newinfo)
    print("\n TUNING PFD AGAIN: ")
    for arr in pfarrs:
        [mf, KFLAGS] = FindPFD(ATLdir, ARCH, KFLAGS, fko, rout, pre, l1bla,
                               N, M, lda, info, arr)
    KFLAGS = fkocmnd.RemoveRedundantPrefFlags(KFLAGS, pfarrs)
    mflist.append(mf)
    testlist.append("pfd2")

    #
    # Report the best case and run the tester on the generated assembly
    #
    print("\n\n BEST FLAGS FOUND (%.2f) = %s"
          % (mf, fkocmnd.RemoveFilesFromFlags(l1bla, KFLAGS)))
    res = fkocmnd.GetOptVals(KFLAGS, pfarrs, pfsets, acc)
    tst = cmnd.test(ATLdir, ARCH, pre, l1bla, N, M, lda, "fkorout.s",
                    cc="gcc", ccf="-x assembler-with-cpp", opt=optT)
    return (res, KFLAGS, mf, tst, testlist, mflist)