def buildIndex(r, runLogDir, desc, index, logFile):
    """Build one index, parse its indexing log and run checkIndex on it.

    Args:
      r: benchmark runner; ``r.makeIndex('nightly', index)`` is called when
        the module-level ``REAL`` flag is set.
      runLogDir: directory that holds (or receives) the indexing log.
      desc: description used only in the progress message.
      index: index definition; ``index.getName()`` selects the index path.
      logFile: file name of the indexing log inside ``runLogDir``.

    Returns:
      Tuple ``(indexPath, indexTimeSec, bytesIndexed, indexAtClose)``.

    Raises:
      RuntimeError: if the indexing log does not contain one of the fields
        matched by ``reBytesIndexed``, ``reIndexAtClose`` or
        ``reIndexingTime``.
    """
    message('build %s' % desc)

    # Always start from a clean directory for this index.
    indexPath = benchUtil.nameToIndexPath(index.getName())
    if os.path.exists(indexPath):
        shutil.rmtree(indexPath)

    logPath = '%s/%s' % (runLogDir, logFile)
    if REAL:
        indexPath, fullLogFile = r.makeIndex('nightly', index)
        os.rename(fullLogFile, logPath)
    # When REAL is off nothing is built; the log must already exist at logPath.

    with open(logPath) as f:
        s = f.read()

    def field(pattern, what):
        # Fail with a readable message instead of AttributeError on None.
        m = pattern.search(s)
        if m is None:
            raise RuntimeError('could not find %s in indexing log %s' % (what, logPath))
        return m.group(1)

    bytesIndexed = int(field(reBytesIndexed, 'bytesIndexed'))
    indexAtClose = field(reIndexAtClose, 'indexAtClose')
    # The logged value is divided by 1000 to get seconds
    # (presumably it is in milliseconds -- confirm against the indexer output).
    indexTimeSec = int(field(reIndexingTime, 'indexingTime')) / 1000.0

    message(' took %.1f sec' % indexTimeSec)

    # run checkIndex
    checkLogFileName = '%s/checkIndex.%s' % (runLogDir, logFile)
    checkIndex(r, indexPath, checkLogFileName)

    return indexPath, indexTimeSec, bytesIndexed, indexAtClose
sourceData, postingsFormat='Lucene90', idFieldPostingsFormat='Memory', grouping=False, doDeletions=False, addDVFields=True, ) c = competition.Competitor('base', constants.TRUNK_CHECKOUT) r = benchUtil.RunAlgs(constants.JAVA_COMMAND, False, False) r.compile(c) r.makeIndex(c.name, index, False) cp = '%s' % r.classPathToString(benchUtil.getClassPath(c.checkout)) fip = '%s/index' % benchUtil.nameToIndexPath(index.getName()) modes = benchUtil.getArg('-mode', 'update', True) docsPerSec = benchUtil.getArg('-dps', '1', True) reopenPerSec = benchUtil.getArg('-rps', '0.2', True) runTimeSec = benchUtil.getArg('-rts', 60, True) numSearchThreads = benchUtil.getArg('-nst', 1, True) # default to 1 search thread numIndexThreads = benchUtil.getArg('-nit', constants.INDEX_NUM_THREADS, True) for mode in modes.split(','): allStats = [] for dps in docsPerSec.split(','): for rps in reopenPerSec.split(','): print print 'params: mode=%s docs/sec=%s reopen/sec=%s runTime(s)=%s searchThreads=%s indexThreads=%s' \
def run(id, base, challenger, coldRun=False, doCharts=False, search=False, index=False,
        verifyScores=True, verifyCounts=True, taskPatterns=None, randomSeed=None,
        requireOverlap=1.0):
    """Compile, optionally index, then benchmark ``challenger`` against ``base``.

    Args:
      id: run identifier forwarded to ``makeIndex``, ``runSimpleSearchBench``,
        ``getSearchLogFiles`` and ``getAggregateProfilerResult``.
      base, challenger: the two competitors being compared.
      coldRun: forwarded to ``runSimpleSearchBench``.
      doCharts: forwarded to ``makeIndex``.
      search: run the search benchmark; also enabled by ``-search`` in
        ``sys.argv``.
      index: build indices first; also enabled by ``-index`` in ``sys.argv``.
      verifyScores, verifyCounts: forwarded to ``benchUtil.RunAlgs``.
      taskPatterns: ``(pos, neg)`` pair of regex-string lists (either may be
        ``None``) used to filter task categories; ``None`` or
        ``(None, None)`` disables filtering.
      randomSeed: required seed for the per-iteration seeds.
      requireOverlap: minimum accepted value of ``cmpDiffs[2]`` in the
        per-iteration report.

    Raises:
      RuntimeError: missing ``randomSeed``, inconsistent task files, differing
        segment counts, or a report that signals errors / differing results.
    """
    competitors = [challenger, base]

    if randomSeed is None:
        raise RuntimeError('missing randomSeed')

    # The declared default used to crash on tuple unpacking below; treat it
    # as "no filtering".
    if taskPatterns is None:
        taskPatterns = (None, None)

    r = benchUtil.RunAlgs(constants.JAVA_COMMAND, verifyScores, verifyCounts)
    if '-noc' not in sys.argv:
        print()
        print('Compile:')
        for c in competitors:
            r.compile(c)

    search = search or '-search' in sys.argv
    index = index or '-index' in sys.argv

    tasksFile = None
    if index:
        seen = set()
        indexSegCount = None
        indexCommit = None
        announced = False
        for c in competitors:
            if tasksFile is None:
                tasksFile = c.tasksFile
            elif tasksFile != c.tasksFile:
                raise RuntimeError('inconsistent taskFile %s vs %s' % (tasksFile, c.tasksFile))
            if c.index in seen:
                continue
            if not announced:
                print()
                print('Create indices:')
                announced = True
            seen.add(c.index)
            r.makeIndex(id, c.index, doCharts)
            segCount = benchUtil.getSegmentCount(
                benchUtil.nameToIndexPath(c.index.getName()))
            if indexSegCount is None:
                indexSegCount = segCount
                indexCommit = c.commitPoint
            elif indexCommit == c.commitPoint and indexSegCount != segCount:
                raise RuntimeError(
                    'segment counts differ across indices: %s vs %s' % (indexSegCount, segCount))

    # Without indexing nothing above picked a tasks file; use base's, which is
    # what the last competitor in the list supplied before.
    if tasksFile is None:
        tasksFile = base.tasksFile

    if search:
        newTasksFile = None
        try:
            if taskPatterns != (None, None):
                pos, neg = taskPatterns
                if pos is None:
                    if neg is None:
                        print(' tasks file: %s' % tasksFile)
                    else:
                        print(' tasks file: NOT %s from %s' % (','.join(neg), tasksFile))
                elif neg is None:
                    print(' tasks file: %s from %s' % (','.join(pos), tasksFile))
                else:
                    print(' tasks file: %s, NOT %s from %s' % (','.join(pos), ','.join(neg), tasksFile))

                posPatterns = None if pos is None else [re.compile(x) for x in pos]
                negPatterns = None if neg is None else [re.compile(x) for x in neg]

                # Write a filtered copy of the tasks file; it is removed in
                # the finally clause below.
                newTasksFile = '%s/%s.tasks' % (constants.BENCH_BASE_DIR, os.getpid())
                with open(tasksFile) as fIn, open(newTasksFile, 'wb') as fOut:
                    for line in fIn:
                        colon = line.find(':')
                        if colon != -1:
                            cat = line[:colon]
                            if posPatterns is not None and not any(p.search(cat) for p in posPatterns):
                                continue
                            if negPatterns is not None and any(p.search(cat) for p in negPatterns):
                                continue
                        # Lines without a category prefix are always kept.
                        fOut.write(line.encode('utf-8'))

                for c in competitors:
                    c.tasksFile = newTasksFile
            else:
                print(' tasks file: %s' % tasksFile)

            results = {}

            if constants.JAVA_COMMAND.find(' -ea') != -1:
                print()
                print(
                    'WARNING: *** assertions are enabled *** JAVA_COMMAND=%s' % constants.JAVA_COMMAND)
                print()

            print()
            print('Search:')

            rand = random.Random(randomSeed)
            staticSeed = rand.randint(-10000000, 1000000)

            # Remove old log files:
            for c in competitors:
                for fileName in r.getSearchLogFiles(id, c):
                    if os.path.exists(fileName):
                        os.remove(fileName)

            for jvmIter in range(base.competition.jvmCount):
                print(' iter %d' % jvmIter)
                seed = rand.randint(-10000000, 1000000)

                for c in competitors:
                    print(' %s:' % c.name)
                    logFile = r.runSimpleSearchBench(jvmIter, id, c, coldRun, seed, staticSeed,
                                                     filter=None, taskPatterns=taskPatterns)
                    results.setdefault(c, []).append(logFile)

                print()
                print('Report after iter %d:' % jvmIter)
                _, cmpDiffs, _ = r.simpleReport(
                    results[base], results[challenger],
                    '-jira' in sys.argv, '-html' in sys.argv,
                    cmpDesc=challenger.name, baseDesc=base.name)
                if cmpDiffs is not None:
                    if cmpDiffs[1]:
                        raise RuntimeError('errors occurred: %s' % str(cmpDiffs))
                    if cmpDiffs[2] < requireOverlap:
                        raise RuntimeError('results differ: %s' % str(cmpDiffs))
        finally:
            if newTasksFile is not None and os.path.exists(newTasksFile):
                os.remove(newTasksFile)

        # TODO: maybe print this after each iter, not just in the end, for the impatient/progressive?
        for mode in 'cpu', 'heap':
            for c in competitors:
                print(f'\n{mode.upper()} merged search profile for {c.name}:')
                print(c.getAggregateProfilerResult(id, mode)[0][1])
    else:
        # Report only, from whatever search logs already exist.
        results = {}
        for c in competitors:
            results[c] = r.getSearchLogFiles(id, c)
        _, cmpDiffs, _ = r.simpleReport(
            results[base], results[challenger],
            '-jira' in sys.argv, '-html' in sys.argv,
            cmpDesc=challenger.name, baseDesc=base.name)
        # NOTE(review): unlike the search path this raises on ANY non-None
        # cmpDiffs and ignores requireOverlap -- confirm that is intended.
        if cmpDiffs is not None:
            raise RuntimeError('results differ: %s' % str(cmpDiffs))
def run(id, base, challenger, coldRun=False, doCharts=False, search=False, index=False,
        debug=False, debugs=False, verifyScores=True, taskPatterns=None, randomSeed=None):
    """Compile, optionally index, then benchmark ``challenger`` against ``base``.

    Prints use the single-argument parenthesised form, which produces the same
    output as the Python 2 print statement.

    Args:
      id: run identifier forwarded to ``makeIndex``, ``runSimpleSearchBench``
        and ``getSearchLogFiles``.
      base, challenger: the two competitors being compared.
      coldRun: forwarded to ``runSimpleSearchBench``.
      doCharts: forwarded to ``makeIndex``.
      search: run the search benchmark; also enabled by ``-search`` in
        ``sys.argv``.
      index: build indices first; also enabled by ``-index`` in ``sys.argv``.
      debug, debugs: accepted for interface compatibility; not read here.
      verifyScores: forwarded to ``benchUtil.RunAlgs``.
      taskPatterns: ``(pos, neg)`` pair of regex-string lists (either may be
        ``None``) used to filter task categories; ``None`` or
        ``(None, None)`` disables filtering.
      randomSeed: required seed for the per-iteration seeds.

    Raises:
      RuntimeError: missing ``randomSeed``, inconsistent task files, differing
        segment counts, or a report with a non-None ``cmpDiffs``.
    """
    competitors = [challenger, base]

    if randomSeed is None:
        raise RuntimeError('missing randomSeed')

    # The declared default used to crash on tuple unpacking below; treat it
    # as "no filtering".
    if taskPatterns is None:
        taskPatterns = (None, None)

    r = benchUtil.RunAlgs(constants.JAVA_COMMAND, verifyScores)
    if '-noc' not in sys.argv:
        print('')
        print('Compile:')
        for c in competitors:
            r.compile(c)

    search = search or '-search' in sys.argv
    index = index or '-index' in sys.argv

    tasksFile = None
    if index:
        seen = set()
        indexSegCount = None
        indexCommit = None
        announced = False
        for c in competitors:
            if tasksFile is None:
                tasksFile = c.tasksFile
            elif tasksFile != c.tasksFile:
                raise RuntimeError('inconsistent taskFile %s vs %s' % (tasksFile, c.tasksFile))
            if c.index in seen:
                continue
            if not announced:
                print('')
                print('Create indices:')
                announced = True
            seen.add(c.index)
            r.makeIndex(id, c.index, doCharts)
            segCount = benchUtil.getSegmentCount(benchUtil.nameToIndexPath(c.index.getName()))
            if indexSegCount is None:
                indexSegCount = segCount
                indexCommit = c.commitPoint
            elif indexCommit == c.commitPoint and indexSegCount != segCount:
                raise RuntimeError('segment counts differ across indices: %s vs %s' % (indexSegCount, segCount))

    # Without indexing nothing above picked a tasks file; use base's, which is
    # what the last competitor in the list supplied before.
    if tasksFile is None:
        tasksFile = base.tasksFile

    if search:
        newTasksFile = None
        try:
            # Compare by value: an identity test against a tuple literal does
            # not recognise an equal tuple passed by the caller.
            if taskPatterns != (None, None):
                pos, neg = taskPatterns
                if pos is None:
                    if neg is None:
                        print(' tasks file: %s' % tasksFile)
                    else:
                        print(' tasks file: NOT %s from %s' % (','.join(neg), tasksFile))
                elif neg is None:
                    print(' tasks file: %s from %s' % (','.join(pos), tasksFile))
                else:
                    print(' tasks file: %s, NOT %s from %s' % (','.join(pos), ','.join(neg), tasksFile))

                if pos is None:
                    posPatterns = None
                else:
                    posPatterns = [re.compile(x) for x in pos]
                if neg is None:
                    negPatterns = None
                else:
                    negPatterns = [re.compile(x) for x in neg]

                # Write a filtered copy of the tasks file; it is removed in
                # the finally clause below.
                newTasksFile = '%s/%s.tasks' % (constants.BENCH_BASE_DIR, os.getpid())
                with open(tasksFile) as fIn:
                    with open(newTasksFile, 'wb') as fOut:
                        for line in fIn:
                            colon = line.find(':')
                            if colon != -1:
                                cat = line[:colon]
                                if posPatterns is not None and not any(p.search(cat) for p in posPatterns):
                                    continue
                                if negPatterns is not None and any(p.search(cat) for p in negPatterns):
                                    continue
                            # Lines without a category prefix are always kept.
                            fOut.write(line)

                for c in competitors:
                    c.tasksFile = newTasksFile
            else:
                print(' tasks file: %s' % tasksFile)

            results = {}

            if constants.JAVA_COMMAND.find(' -ea') != -1:
                print('')
                print('WARNING: *** assertions are enabled *** JAVA_COMMAND=%s' % constants.JAVA_COMMAND)
                print('')

            print('')
            print('Search:')

            rand = random.Random(randomSeed)
            staticSeed = rand.randint(-10000000, 1000000)

            # Remove old log files:
            for c in competitors:
                for fileName in r.getSearchLogFiles(id, c):
                    if os.path.exists(fileName):
                        os.remove(fileName)

            for jvmIter in xrange(base.competition.jvmCount):
                print(' iter %d' % jvmIter)
                seed = rand.randint(-10000000, 1000000)

                for c in competitors:
                    print(' %s:' % c.name)
                    logFile = r.runSimpleSearchBench(jvmIter, id, c, coldRun, seed, staticSeed,
                                                     filter=None, taskPatterns=taskPatterns)
                    results.setdefault(c, []).append(logFile)

                print('')
                print('Report after iter %d:' % jvmIter)
                _, cmpDiffs, _ = r.simpleReport(results[base], results[challenger],
                                                '-jira' in sys.argv, '-html' in sys.argv,
                                                cmpDesc=challenger.name, baseDesc=base.name)
                if cmpDiffs is not None:
                    raise RuntimeError('results differ: %s' % str(cmpDiffs))
        finally:
            if newTasksFile is not None and os.path.exists(newTasksFile):
                os.remove(newTasksFile)
    else:
        # Report only, from whatever search logs already exist.
        results = {}
        for c in competitors:
            results[c] = r.getSearchLogFiles(id, c)
        _, cmpDiffs, _ = r.simpleReport(results[base], results[challenger],
                                        '-jira' in sys.argv, '-html' in sys.argv,
                                        cmpDesc=challenger.name, baseDesc=base.name)
        if cmpDiffs is not None:
            raise RuntimeError('results differ: %s' % str(cmpDiffs))
index = comp.newIndex(constants.TRUNK_CHECKOUT, sourceData, postingsFormat='Lucene50', idFieldPostingsFormat='Memory', grouping=False, doDeletions=False, addDVFields=True, ) c = competition.Competitor('base', constants.TRUNK_CHECKOUT) r = benchUtil.RunAlgs(constants.JAVA_COMMAND, False) r.compile(c) r.makeIndex(c.name, index, False) cp = '%s' % r.classPathToString(r.getClassPath(c.checkout)) fip = '%s/index' % benchUtil.nameToIndexPath(index.getName()) modes = benchUtil.getArg('-mode', 'update', True) docsPerSec = benchUtil.getArg('-dps', '1', True) reopenPerSec = benchUtil.getArg('-rps', '0.2', True) runTimeSec = benchUtil.getArg('-rts', 60, True) numSearchThreads = benchUtil.getArg('-nst', 1, True) # default to 1 search thread numIndexThreads = benchUtil.getArg('-nit', constants.INDEX_NUM_THREADS, True) for mode in modes.split(','): allStats = [] for dps in docsPerSec.split(','): for rps in reopenPerSec.split(','): print print 'params: mode=%s docs/sec=%s reopen/sec=%s runTime(s)=%s searchThreads=%s indexThreads=%s' \ % (mode, dps, rps, runTimeSec, numSearchThreads, numIndexThreads) reopenStats = runOne(classpath=cp,
def run():
    """Run one nightly benchmark: update sources, index, search, report, archive.

    Steps, in order: update the checkout (when ``REAL``), build the medium and
    big "fast" indices, run the NRT test, build the fixed-structure search
    index, run the search benchmark, compare against the previous night's
    ``.prev`` result files (unless ``-reset`` is in ``sys.argv``), write the
    HTML report, rotate result files and the index to ``.prev``, tar the logs
    and pickle the results tuple into the run's log directory.

    Prints use the single-argument parenthesised form, which produces the same
    output as the Python 2 print statement.

    Raises:
      RuntimeError: if ``svn update`` keeps failing, or the search results
        comparison reports errors.
    """
    DO_RESET = '-reset' in sys.argv

    for _ in range(4):
        print('')
    message('start')
    id = 'nightly'
    if not REAL:
        # Fixed timestamp so debug runs are reproducible.
        start = datetime.datetime(year=2011, month=5, day=19, hour=23, minute=0, second=1)
    else:
        start = now()
    timeStamp = '%04d.%02d.%02d.%02d.%02d.%02d' % (start.year, start.month, start.day,
                                                   start.hour, start.minute, start.second)
    runLogDir = '%s/%s' % (NIGHTLY_LOG_DIR, timeStamp)
    if REAL:
        os.makedirs(runLogDir)
    message('log dir %s' % runLogDir)

    os.chdir('%s/%s' % (constants.BASE_DIR, NIGHTLY_DIR))
    # NOTE(review): SOURCE's indentation was lost; the rev / java-version
    # queries are placed in the REAL branch because the non-REAL branch
    # supplies canned revs -- confirm against the original layout.
    if not REAL:
        svnRev = '1102160'
        luceneUtilRev = '2270c7a8b3ac+ tip'
        # Not queried in debug mode; defined so the HTML report can be written.
        javaVersion = None
        print('SVN rev is %s' % svnRev)
        print('luceneutil rev is %s' % luceneUtilRev)
    else:
        runCommand('%s cleanup' % constants.SVN_EXE)
        iters = 30
        updateLog = '%s/update.log' % runLogDir
        for _ in range(iters):
            try:
                runCommand('%s update > %s' % (constants.SVN_EXE, updateLog))
            except RuntimeError:
                message(' retry...')
                time.sleep(60.0)
            else:
                with open(updateLog, 'rb') as f:
                    svnRev = int(reSVNRev.search(f.read()).group(1))
                print('SVN rev is %s' % svnRev)
                break
        else:
            raise RuntimeError('failed to run svn update after %d tries' % iters)

        luceneUtilRev = os.popen('hg id %s' % constants.BENCH_BASE_DIR).read().strip()
        print('luceneutil rev is %s' % luceneUtilRev)
        javaVersion = os.popen('%s -fullversion 2>&1' % constants.JAVA_COMMAND).read().strip()
        print('%s' % javaVersion)
        print('Java command-line: %s' % constants.JAVA_COMMAND)

    runCommand('%s clean > clean.log 2>&1' % constants.ANT_EXE)

    r = benchUtil.RunAlgs(constants.JAVA_COMMAND, True)

    comp = competition.Competition(taskRepeatCount=TASK_REPEAT_COUNT,
                                   taskCountPerCat=COUNTS_PER_CAT)

    mediumSource = competition.Data('wikimedium',
                                    MEDIUM_LINE_FILE,
                                    MEDIUM_INDEX_NUM_DOCS,
                                    constants.WIKI_MEDIUM_TASKS_FILE)

    fastIndexMedium = comp.newIndex(NIGHTLY_DIR, mediumSource,
                                    analyzer='StandardAnalyzerNoStopWords',
                                    postingsFormat='Lucene41',
                                    numThreads=constants.INDEX_NUM_THREADS,
                                    directory=DIR_IMPL,
                                    idFieldPostingsFormat='Memory',
                                    ramBufferMB=INDEXING_RAM_BUFFER_MB,
                                    waitForMerges=False,
                                    grouping=False,
                                    verbose=False,
                                    mergePolicy='TieredMergePolicy',
                                    maxConcurrentMerges=3)

    bigSource = competition.Data('wikibig',
                                 BIG_LINE_FILE,
                                 BIG_INDEX_NUM_DOCS,
                                 constants.WIKI_MEDIUM_TASKS_FILE)

    fastIndexBig = comp.newIndex(NIGHTLY_DIR, bigSource,
                                 analyzer='StandardAnalyzerNoStopWords',
                                 postingsFormat='Lucene41',
                                 numThreads=constants.INDEX_NUM_THREADS,
                                 directory=DIR_IMPL,
                                 idFieldPostingsFormat='Memory',
                                 ramBufferMB=INDEXING_RAM_BUFFER_MB,
                                 waitForMerges=False,
                                 grouping=False,
                                 verbose=False,
                                 mergePolicy='TieredMergePolicy',
                                 maxConcurrentMerges=3)

    # Must use only 1 thread so we get same index structure, always:
    index = comp.newIndex(NIGHTLY_DIR, mediumSource,
                          analyzer='StandardAnalyzerNoStopWords',
                          postingsFormat='Lucene41',
                          numThreads=1,
                          directory=DIR_IMPL,
                          idFieldPostingsFormat='Memory',
                          mergePolicy='LogDocMergePolicy',
                          doFacets=True,
                          maxConcurrentMerges=3)

    c = comp.competitor(id, NIGHTLY_DIR,
                        index=index,
                        directory=DIR_IMPL,
                        analyzer='StandardAnalyzerNoStopWords',
                        commitPoint='multi',
                        doFacets=True)

    if REAL:
        r.compile(c)

    # 1: test indexing speed: small (~ 1KB) sized docs, flush-by-ram
    medIndexPath, medIndexTime, medBytesIndexed, atClose = buildIndex(
        r, runLogDir, 'medium index (fast)', fastIndexMedium, 'fastIndexMediumDocs.log')
    message('medIndexAtClose %s' % atClose)

    # 2: NRT test
    nrtResults = runNRTTest(r, medIndexPath, runLogDir)

    # 3: test indexing speed: medium (~ 4KB) sized docs, flush-by-ram
    _, bigIndexTime, bigBytesIndexed, atClose = buildIndex(
        r, runLogDir, 'big index (fast)', fastIndexBig, 'fastIndexBigDocs.log')
    message('bigIndexAtClose %s' % atClose)

    # 4: test searching speed; first build index, flushed by doc count (so we
    # get same index structure night to night)
    indexPathNow, _, _, atClose = buildIndex(
        r, runLogDir, 'search index (fixed segments)', index, 'fixedIndex.log')
    message('fixedIndexAtClose %s' % atClose)
    fixedIndexAtClose = atClose

    indexPathPrev = '%s/trunk.nightly.index.prev' % constants.INDEX_DIR_BASE

    currentIndexPath = benchUtil.nameToIndexPath(index.getName())
    if os.path.exists(indexPathPrev) and os.path.exists(currentIndexPath):
        segCountPrev = benchUtil.getSegmentCount(indexPathPrev)
        segCountNow = benchUtil.getSegmentCount(currentIndexPath)
        if segCountNow != segCountPrev:
            # Deliberately a warning, not an error.
            print('WARNING: different index segment count prev=%s now=%s' % (segCountPrev, segCountNow))

    # Search
    rand = random.Random(714)
    staticSeed = rand.randint(-10000000, 1000000)

    message('search')
    t0 = now()

    coldRun = False
    c.tasksFile = '%s/wikinightly.tasks' % constants.BENCH_BASE_DIR
    c.printHeap = True
    if REAL:
        resultsNow = []
        for jvmIter in xrange(JVM_COUNT):
            seed = rand.randint(-10000000, 1000000)
            resultsNow.append(r.runSimpleSearchBench(jvmIter, id, c, coldRun, seed, staticSeed, filter=None))
    else:
        # Debug mode: reuse result files from an earlier run.
        resultsNow = ['%s/%s/modules/benchmark/%s.%s.x.%d' % (constants.BASE_DIR, NIGHTLY_DIR, id, c.name, jvmIter)
                      for jvmIter in xrange(20)]
    message('done search (%s)' % (now()-t0))

    resultsPrev = []
    for fname in resultsNow:
        prevFName = fname + '.prev'
        if os.path.exists(prevFName):
            resultsPrev.append(prevFName)

    # Both stay None with -reset; searchHeaps must be bound because it is
    # stored in the results tuple below.
    searchResults = searchHeaps = None

    if not DO_RESET:
        output = []
        reportResults, cmpDiffs, searchHeaps = r.simpleReport(resultsPrev,
                                                              resultsNow,
                                                              False, True,
                                                              'prev', 'now',
                                                              writer=output.append)
        timeStamp2 = '%s %02d/%02d/%04d' % (start.strftime('%a'), start.month, start.day, start.year)
        html = [
            '<html>\n',
            '<h1>%s</h1>' % timeStamp2,
            'Lucene/Solr trunk rev %s<br>' % svnRev,
            'luceneutil rev %s<br>' % luceneUtilRev,
            '%s<br>' % javaVersion,
            'Java command-line: %s<br>' % htmlEscape(constants.JAVA_COMMAND),
            'Index: %s<br>' % fixedIndexAtClose,
            '<br><br><b>Search perf vs day before</b>\n',
            ''.join(output),
            '<br><br>',
            '<img src="%s.png"/>\n' % timeStamp,
            '</html>\n',
        ]
        with open('%s/%s.html' % (NIGHTLY_REPORTS_DIR, timeStamp), 'wb') as f:
            f.write(''.join(html))

        if os.path.exists('out.png'):
            shutil.move('out.png', '%s/%s.png' % (NIGHTLY_REPORTS_DIR, timeStamp))
        searchResults = reportResults

        print(' heaps: %s' % str(searchHeaps))

        if cmpDiffs is not None:
            warnings, errors = cmpDiffs
            print('WARNING: search result differences: %s' % str(warnings))
            if len(errors) > 0:
                raise RuntimeError('search result differences: %s' % str(errors))

    results = (start,
               MEDIUM_INDEX_NUM_DOCS, medIndexTime, medBytesIndexed,
               BIG_INDEX_NUM_DOCS, bigIndexTime, bigBytesIndexed,
               nrtResults,
               searchResults,
               svnRev,
               luceneUtilRev,
               searchHeaps)

    for fname in resultsNow:
        shutil.copy(fname, runLogDir)
        if os.path.exists(fname + '.stdout'):
            shutil.copy(fname + '.stdout', runLogDir)

    if REAL:
        # Tonight's results and index become tomorrow's baseline.
        for fname in resultsNow:
            shutil.move(fname, fname + '.prev')

        if not DEBUG:
            if os.path.exists(indexPathNow):
                if os.path.exists(indexPathPrev):
                    shutil.rmtree(indexPathPrev)
                os.rename(indexPathNow, indexPathPrev)

    # Archive the logs, then drop the loose files (names are relative to the
    # new working directory).
    os.chdir(runLogDir)
    runCommand('tar cjf logs.tar.bz2 *')
    for name in os.listdir(runLogDir):
        if name != 'logs.tar.bz2':
            os.remove(name)

    if DEBUG:
        resultsFileName = 'results.debug.pk'
    else:
        resultsFileName = 'results.pk'

    with open('%s/%s' % (runLogDir, resultsFileName), 'wb') as f:
        f.write(cPickle.dumps(results))

    if REAL:
        runCommand('chmod -R a-w %s' % runLogDir)

    message('done: total time %s' % (now()-start))