def process(data, overwrite=False):
    """Generate the Gaussian data set, convert it, and benchmark every fold.

    Runs ``prog.genGauss`` followed by the ``vec2bin``, ``vec2hdf5`` and
    ``vec2vect`` converters, then calls ``runkbench`` once per fold on a
    fresh ``dh.Data`` built from ``data.cfg`` with ``cfg.F`` set to the fold
    index.

    Args:
        data: Data handle whose ``cfg`` provides ``F`` and ``'nfolds'``.
        overwrite: Forwarded unchanged to every step (presumably forces
            existing outputs to be regenerated -- verify against ``prog``).

    Returns:
        None. ``data.cfg.F`` is restored to the value it had on entry, also
        when one of the folds raises.
    """
    prog.genGauss(data, overwrite=overwrite, printcmd=True)
    prog.vec2bin(data, overwrite, printcmd=True)
    prog.vec2hdf5(data, overwrite, printcmd=True)
    prog.vec2vect(data, overwrite, printcmd=True)

    # Kept function-local as in the original, but no longer re-executed on
    # every loop iteration.
    from runlsh import runkbench

    oldfold = data.cfg.F
    try:
        for fold in range(data.cfg['nfolds']):
            data.cfg.F = fold
            runkbench(dh.Data(data.cfg), overwrite)
    finally:
        # The config object is shared with the caller: never leave it
        # pointing at an intermediate fold if a benchmark run fails.
        data.cfg.F = oldfold


# confName = "gaussoraConfig_nclus=%d_dim=%d_var=%s.txt" %(nclus,dim,var)
gcprog = "gaussoraConf.pl"
# confFile = "%s/%s" %(confDir,confName)
gprog = "gaussora"
# dataName = "gaussian_nclus=%d_dim=%d_var=%s_size=%d.vec" %(nclus,dim,var,size)
# dataFile = "%s/%s" %(dataDir,dataName)
# dataName2 = "gaussian_nclus=%d_dim=%d_var=%s_size=%d.vect" %(nclus,dim,var,size)
# dataFile2 = "%s/%s" %(dataDir,dataName2)
# convertProg1 = "vec2hdf5"
# hdf5Name = "gaussian_nclus=%d_dim=%d_var=%s_size=%d.hdf5" %(nclus,dim,var,size)
# hdf5File = "%s/%s" %(dataDir,hdf5Name)
# convertProg2 = "vec2bin"
# vbinName = "gaussian_nclus=%d_dim=%d_var=%s_size=%d.lbin" %(nclus,dim,var,size)
# vbinFile = "%s/%s" %(dataDir,vbinName)
createQueryProy = "createQueries"
def getlsh(data):
    """Collect one statter per valid LSH type.

    For every type yielded by ``dh.LSHTypeEnum.getValidTypes()`` the type is
    written to ``data.cfg['lshtype']``, a fresh ``dh.Data`` is built from that
    config, and an ``LSHStatter`` over its folded ``'lshbenchfilepath'`` files
    is created. If building the statter fails, a ``NOStatter`` labelled with
    the upper-cased type name is recorded instead.

    Note that ``data.cfg['lshtype']`` is left set to the last type visited.

    Args:
        data: Data handle whose ``cfg`` is mutated while iterating.

    Returns:
        A list with one statter per valid type, in enumeration order.
    """
    statters = []
    for lshtype in dh.LSHTypeEnum.getValidTypes():
        data.cfg['lshtype'] = lshtype
        nd = dh.Data(data.cfg)
        try:
            statter = LSHStatter(nd.getFoldedFiles('lshbenchfilepath'), nd)
        except Exception:
            # Best effort: a type without usable benchmark results still gets
            # a placeholder entry. Catching Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit propagate.
            statter = NOStatter(lshtype.name.upper(), nd)
        statters.append(statter)
    return statters
def fullprocess(data, overwritedata=False, overwriteindex=False, overwritebench=False):
    """Prepare the data, pick the best LSH config, run it and print stats.

    Steps, in order: create the index/query/result/conf directories, convert
    the data with ``vec2bin`` and ``vec2hdf5``, obtain the best config from
    ``findbest``, build a ``dh.Data`` from it, run ``process`` on that data,
    and print the statistics of the resulting folded benchmark files.

    Args:
        data: Data handle for the initial configuration.
        overwritedata: Passed to the converters, ``findbest`` and ``process``.
        overwriteindex: Passed to ``findbest`` and ``process``.
        overwritebench: Passed to ``findbest`` and ``process``.

    Returns:
        A ``(processed, stats)`` tuple: the value returned by ``process`` and
        the ``LSHStatter`` built from its folded benchmark files.
    """
    dh.Data.mkdirs(
        data.indexdir,
        data.querydir,
        data.resultdir,
        data.confdir,
    )
    vec2bin(data, overwritedata)
    vec2hdf5(data, overwritedata)

    bestcfg = findbest(data, overwritedata, overwriteindex, overwritebench)
    tuned = dh.Data(bestcfg)
    print("Best M, L ", bestcfg['lshM'], bestcfg['lshL'])

    processed = process(tuned, overwritedata, overwriteindex, overwritebench)
    folded_files = processed.getFoldedFiles('lshbenchfilepath')
    stats = LSHStatter(folded_files, processed)
    sprinter.printstats(stats)
    return processed, stats
def findbest(data, overwritedata=False, overwriteindex=False, overwritebench=False):
    """Search the (M, L, N) grid and return the best-scoring LSH config.

    Each grid point is written into a working copy of ``data.cfg`` (with
    ``'nfolds'`` forced to 2), ``process`` is run for every fold, and the
    result is scored as the mean of ``precision``, ``recall`` and
    ``1 - cost`` reported by ``LSHStatter``. The highest score wins.

    Args:
        data: Data handle; ``data.S`` seeds the M candidate and ``data.cfg``
            is deep-copied, never modified.
        overwritedata: Accepted for interface compatibility; not used here.
        overwriteindex: Accepted for interface compatibility; not used here.
        overwritebench: Accepted for interface compatibility; not used here.

    Returns:
        A config copy holding the best parameters found, with ``'nfolds'``
        reset to the caller's value. If no grid point scores above 0 (or all
        of them fail), this is an unmodified copy of ``data.cfg``.
    """
    sq = math.sqrt(data.S)
    cfg = copy.deepcopy(data.cfg)
    bestcfg = copy.deepcopy(data.cfg)
    cfg['nfolds'] = 2  # the search itself always runs on two folds
    bestcost = 0

    grid = [(M, L, N) for M in [sq / 8] for L in [4] for N in [6]]

    for M, L, N in grid:
        try:
            cfg['lshM'] = int(M)
            cfg['lshL'] = int(L)
            cfg['lshN'] = int(N)
            print("cfg", cfg['lshM'])
            for i in range(cfg['nfolds']):
                cfg.F = i
                rundata = process(dh.Data(cfg))
            # Statistics are gathered once per grid point, over the folded
            # files of the last processed run.
            st = LSHStatter(rundata.getFoldedFiles('lshbenchfilepath'), data)
            weightedcost = (st.precision + st.recall + (1 - st.cost)) / 3
            if weightedcost > bestcost:
                bestcost = weightedcost
                bestcfg = copy.deepcopy(cfg)
        except Exception:
            # A failing grid point must not abort the search; report it and
            # move on. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the search can still be interrupted.
            traceback.print_exc()

    bestcfg['nfolds'] = data.cfg['nfolds']
    return bestcfg
# NOTE(review): this physical line holds several statements whose original
# line breaks and indentation are lost. It contains, in order:
#   1. `printcmd=True)` -- the tail of a call that begins before this line.
#   2. gendata(data, overwrite=False): creates the bench, conf, index and
#      query directories via dh.Data.mkdirs, calls vec2hdf5 on the data, runs
#      genGauss.process when data.cfg.synthetic is truthy, and calls _vec2hdf5
#      on data.qvecfilepath / data.qhdf5filepath. Whether that _vec2hdf5 call
#      belongs inside the `if data.cfg.synthetic:` branch cannot be recovered
#      from this line -- TODO confirm against the original layout.
#   3. The script entry point: substitutes sysarg.args(__file__) for sys.argv
#      when no command-line arguments were given, reads the --overwritedata /
#      --overwriteindex / --overwritebench flags, builds config.Config and
#      dh.Data from the parsed arguments, registers the log file and calls
#      fullprocess.
# NOTE(review): overwritei and ocfg are assigned but never read here, and
# overwriteb is passed as the third positional argument of fullprocess --
# confirm this matches the parameter order of the fullprocess in scope.
printcmd=True) def gendata(data, overwrite=False): dh.Data.mkdirs(data.benchdir, data.confdir, data.indexdir, data.querydir) vec2hdf5(data, overwrite=overwrite) if data.cfg.synthetic: genGauss.process(data, overwrite) _vec2hdf5(data.qvecfilepath, data.qhdf5filepath, overwrite=overwrite, printcmd=True) if __name__ == "__main__": if len(sys.argv) == 1: sys.argv = sysarg.args(__file__) overwrited = '--overwritedata' in sys.argv overwritei = '--overwriteindex' in sys.argv overwriteb = '--overwritebench' in sys.argv args, unknown = sysarg.getParsed(sys.argv, True) print(args) cfg = config.Config(vars(args)) ocfg = copy.deepcopy(cfg) data = dh.Data(cfg) addLogFile(data.logfile) fullprocess(data, overwrited, overwriteb)
# NOTE(review): this physical line holds several statements whose original
# line breaks and indentation are lost. It contains, in order:
#   1. The tail of a function that begins before this line: it prints the
#      current traceback, copies data.cfg['nfolds'] into bestcfg['nfolds']
#      and returns bestcfg.
#   2. The script entry point: substitutes sysarg.args(__file__) for sys.argv
#      when no command-line arguments were given, reads the --overwriteindex /
#      --overwritedata / --overwritebench flags, parses the known arguments
#      with the parser from sysarg.getArgParse(..., needsquerydata=True),
#      builds config.Config and dh.Data, registers the log file, runs
#      genGauss.process when runcfg.synthetic is truthy, and then calls
#      fullprocess with the three overwrite flags passed by keyword.
# NOTE(review): the result of fullprocess is rebound to rundata and then used
# as a data handle (rundata.getFoldedFiles); if the fullprocess in scope
# returns a tuple rather than a single object this would fail -- confirm.
# NOTE(review): everything after the first `#` on this line (including the
# `files = ...` and `st = ...` statements) is only live code if the original
# line breaks are restored.
traceback.print_exc() bestcfg['nfolds'] = data.cfg['nfolds'] return bestcfg if __name__ == "__main__": if len(sys.argv) == 1: sys.argv = sysarg.args(__file__) overwritei = '--overwriteindex' in sys.argv overwrited = '--overwritedata' in sys.argv overwriteb = '--overwritebench' in sys.argv ap = sysarg.getArgParse(sys.argv, needsquerydata=True) args, unknown = ap.parse_known_args() runcfg = config.Config(vars(args)) rundata = dh.Data(runcfg) addLogFile(rundata.logfile) if runcfg.synthetic: genGauss.process(rundata, overwrited) rundata = fullprocess(rundata, overwritedata=overwrited, overwriteindex=overwritei, overwritebench=overwriteb) # printl('config', 'avgcalcs', 'meanquerytime', 'precision', 'recall', 'cost') files = rundata.getFoldedFiles('lshbenchfilepath') # print(rundata.lshbenchfilepath) # print(files) st = LSHStatter(files, rundata)