def initBuildQueues(dConfig):

    # purge in-progress queue
    qRedis = RedisQueue(dConfig['redis-queue-building'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    qRedis.flush()

    # purge to-build queue
    qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    qRedis.flush()
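# RedisQueue and RedisSet are not defined in this file. The sketches below are
# minimal, illustrative reconstructions of the interface the functions here
# rely on (flush/put/get/getnpush/size, set membership), assuming the redis-py
# client; the real helper classes may differ in detail.
import redis

class RedisQueueSketch(object):
    """FIFO queue on a redis list; name2 backs the dual-queue getnpush()."""

    def __init__(self, name, name2=None, namespace='queue', host='localhost', port=6379):
        self.key = '%s:%s' % (namespace, name)
        self.key2 = '%s:%s' % (namespace, name2) if name2 else None
        self.db = redis.Redis(host=host, port=int(port))

    def flush(self):
        self.db.delete(self.key)

    def put(self, item):
        self.db.lpush(self.key, item)          # newest items at the head

    def get(self, block=True, timeout=None):
        if block:
            item = self.db.brpop(self.key, timeout=timeout or 0)
            return item[1] if item else None   # brpop returns (key, value)
        return self.db.rpop(self.key)

    def getnpush(self, block=True, timeout=None):
        # atomically move the oldest item onto the secondary ("building") queue
        if block:
            return self.db.brpoplpush(self.key, self.key2, timeout=timeout or 0)
        return self.db.rpoplpush(self.key, self.key2)

    def size(self):
        return self.db.llen(self.key)

    def __len__(self):
        return self.size()

class RedisSetSketch(object):
    """Plain redis set; supports the `in`, len(), put() and flush() uses seen below."""

    def __init__(self, name, namespace='set', host='localhost', port=6379):
        self.key = '%s:%s' % (namespace, name)
        self.db = redis.Redis(host=host, port=int(port))

    def flush(self):
        self.db.delete(self.key)

    def put(self, item):
        self.db.sadd(self.key, item)

    def __contains__(self, item):
        return self.db.sismember(self.key, item)

    def __len__(self):
        return self.db.scard(self.key)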
def findProjects(sCorpusPath, dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        if iLevel >= 11:
            # prune the walk in place -- do not descend below the project-root level
            del lDirs[:]

        if iLevel == 11:

            if dConfig['debug']:
                debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

            qRedis.put(sRoot)
            iCount += 1

            if dConfig['debug'] and iCount >= 10:
                break

    printMsg('func: findProjects()', str(iCount), 'projects loaded into queue for processing')
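# Illustrative only: the depth-limited walk above works by counting path
# separators and clearing lDirs in place, which stops os.walk() from
# descending further. A self-contained demo of the same idiom (the depth of 3
# here is arbitrary; the corpus code uses 11):
import os

def walk_to_depth(sTop, iMaxLevel):
    for sRoot, lDirs, lFiles in os.walk(sTop):
        iLevel = sRoot.count(os.sep)
        if iLevel >= iMaxLevel:
            del lDirs[:]   # must mutate the list in place; rebinding lDirs would not prune
        yield sRoot, iLevel

# usage: for sRoot, iLevel in walk_to_depth('/tmp', 3): debug(sRoot, iLevel)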
def writeBuildSummaries(dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-json'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    while 1:

        # get next project summary to process
        sProjectSummary = qRedis.get(block=True, timeout=30)

        if sProjectSummary:

            dProjectSummary = json.loads(sProjectSummary)

            # map the corpus source path to the corresponding build output path
            #sBuildPath = os.path.relpath(dProjectSummary['sourcePath'], '/nfscorpus/nfscorpus')
            #sBuildPath = os.path.join('/nfsbuild/nfsbuild', sBuildPath)

            if "_8tof" in dProjectSummary['sourcePath']:
                sBuildPath = os.path.relpath(dProjectSummary['sourcePath'], '/data/corpus_8tof')
                sBuildPath = os.path.join('/data/builder_SAN/outputCyber', sBuildPath)

            if "_0to7" in dProjectSummary['sourcePath']:
                sBuildPath = os.path.relpath(dProjectSummary['sourcePath'], '/data/corpus_0to7')
                sBuildPath = os.path.join('/data/builder_SAN/outputCyber', sBuildPath)

            (sBuildPath, _) = os.path.split(sBuildPath)

            # ensure build directory exists
            sCmd = 'mkdir -p ' + sBuildPath

            if dConfig['debug']:
                debug('func: writeBuildSummaries() mkdir cmd:', sCmd)

            os.system(sCmd)

            sJsonPath = os.path.join(sBuildPath, 'build.json')

            if dConfig['debug']:
                debug('func: writeBuildSummaries() sJsonPath:', sJsonPath)

            with open(sJsonPath, 'w') as fJson:
                fJson.write(json.dumps(dProjectSummary, indent=4))

        else:
            break
def initProjects(dConfig):

    # flush project queue; queue used to traverse projects (reset every time)
    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    qRedis.flush()

    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])
    dMp.open()

    # NOTE: the projects table is intentionally NOT flushed here -- it is being
    # preserved so the second half of the corpus can be added to it.
    # dMp.flush(sTable='projects', bDebug=dConfig['debug'])

    dMp.close()
def processProjects(dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-name'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    oES = Elasticsearch(dConfig['es-instance-locs'])

    while 1:

        # get next project to process
        sProjectPath = qRedis.get(block=True, timeout=30)

        if sProjectPath:
            findProjectFiles( (sProjectPath, oES, dConfig) )
        else:
            break
def processProjects(dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-name'], namespace='queue', host=dConfig['redis-loc'])

    while 1:

        # get next project to process
        sProjectPath = qRedis.get(block=True, timeout=30)

        if sProjectPath:
            changePerms( (sProjectPath, dConfig) )
        else:
            break
def findProjects(sCorpusPath, iForks, dConfig):

    lProjectPaths = []

    if dConfig['redis']:
        qRedis = RedisQueue(dConfig['redis-queue-name'], namespace='queue', host=dConfig['redis-loc'])

        # ensure redis queue is empty prior to starting consumers
        qRedis.flush()

    iCount = 0

    for sRoot, lDirs, lFiles in os.walk(sCorpusPath):

        iLevel = sRoot.count(os.sep)

        if iLevel >= 11:
            del lDirs[:]

        if iLevel == 11:

            if dConfig['debug']:
                debug('func: findProjects()', 'projects-root:', sRoot, iLevel)

            if dConfig['redis']:
                qRedis.put(sRoot)
            else:
                lProjectPaths.append(sRoot)

            iCount += 1

            if dConfig['debug'] and iCount >= 1:
                break

    printMsg('func: findProjects()', str(iCount), 'projects loaded into queue for processing')

    return lProjectPaths
def initTargets(dConfig):

    # flush source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    qRedis.flush()

    # purge build targets queue -- considering whether mysql ingestion should be
    # split from the elasticsearch queries... mysql may benefit from a consumer
    # pool inserting statements concurrently
    # qRedis = RedisQueue(dConfig['redis-queue-build-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    # qRedis.flush()

    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])
    dMp.open()

    # truncate sourceTargets table before re-populating
    # dMp.flush(sTable='sourceTargets', bDebug=dConfig['debug'])

    # truncate buildTargets table before re-populating
    # dMp.flush(sTable='buildTargets', bDebug=dConfig['debug'])

    dMp.close()
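# MuseProjectDB is also external to this file. The sketch below is an
# illustrative reconstruction of the subset of its interface used here
# (open/close/flush/select); the bulk-insert helpers and the real class's
# SQL handling may differ. It assumes the MySQLdb driver.
import MySQLdb

class MuseProjectDBSketch(object):

    def __init__(self, db, port, user, passwd, loc):
        self.dParams = dict(db=db, port=int(port), user=user, passwd=passwd, host=loc)
        self.oConn = None

    def open(self):
        self.oConn = MySQLdb.connect(**self.dParams)

    def close(self):
        self.oConn.close()

    def flush(self, sTable, bDebug=False):
        oCursor = self.oConn.cursor()
        oCursor.execute('TRUNCATE TABLE ' + sTable)
        self.oConn.commit()

    def select(self, sSelectClause, sTable, sWhereClause='', sOrderByClause='', sLimitClause='', bDebug=False):
        # assembles the same clause-style query the call sites above build up
        sSql = 'SELECT %s FROM %s' % (sSelectClause, sTable)
        if sWhereClause:
            sSql += ' WHERE ' + sWhereClause
        if sOrderByClause:
            sSql += ' ORDER BY ' + sOrderByClause
        if sLimitClause:
            sSql += ' LIMIT ' + sLimitClause
        oCursor = self.oConn.cursor()
        oCursor.execute(sSql)
        return oCursor.fetchall()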
def main(argv):

    # defaults
    sCorpusPath = '/data/builder_SAN2/RAT'
    # sCorpusPath = '/data/corpus_0to7'
    # sCorpusPath = '/data/corpus_8tof'

    dConfig = {}
    dConfig['es-bulk-chunk-size'] = 500
    dConfig['debug'] = False

    # binding to muse2 doesn't work right now
    dConfig['es-instance-locs'] = ['muse1-int', 'muse2-int', 'muse3-int']
    #dConfig['es-instance-locs'] = ['muse2-int','muse3-int']
    #dConfig['es-instance-locs'] = ['muse3-int']

    dConfig['es-index-name'] = 'rat-corpus-source'
    dConfig['es-index-type'] = 'files'

    dConfig['redis-queue-name'] = 'rat-project-paths'
    dConfig['redis-loc'] = 'muse2-int'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = False

    dConfig['time-stamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    iForks = 5
    bError = False

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'c:f:rd', ['corpus-dir-path=', 'forks=', 'redis', 'debug'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:
        if opt in ('-c', '--corpus-dir-path'):
            sCorpusPath = arg
        elif opt in ('-d', '--debug'):
            dConfig['debug'] = True
        elif opt in ('-r', '--redis'):
            dConfig['redis'] = True
        elif opt in ('-f', '--forks'):
            try:
                iForks = int(arg)
            except ValueError as e:
                bError = True

    if not os.path.isdir(sCorpusPath):
        bError = True

    if bError:
        usage()
    else:
        iStart = time.time()

        #oES = createESIndex(dConfig)
        oES = Elasticsearch(dConfig['es-instance-locs'])

        ### setup producer
        lProjectPaths = []

        if dConfig['redis']:
            qRedis = RedisQueue(dConfig['redis-queue-name'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

            # ensure redis queue is empty prior to starting consumers
            # qRedis.flush()

            # call producer process that populates redis queue with project path roots
            pProducer = multiprocessing.Process(target=findProjects, args=(qRedis, sCorpusPath, dConfig))
            pProducer.start()
        else:
            lProjectPaths = findProjects(None, sCorpusPath, dConfig)

        ### setup consumers
        lArgs = []
        iForks = 1

        if dConfig['redis']:
            # create pool of workers
            oPool = multiprocessing.Pool(processes=iForks)

            for i in range(0, iForks):
                lArgs.append(dConfig)

            ### do work -- use pool of workers to descend into each project path recording/ingesting all file names
            oPool.map(processProjects, lArgs)

            pProducer.join()

            oPool.close()
            oPool.join()
        else:
            for sPath in lProjectPaths:
                findProjectFiles( (sPath, oES, dConfig) )

        if dConfig['debug']:
            debug('func: main()', "all processes completed")

        # es index was created with replication turned off for speed; turn on replicating shards
        turnReplicationOn(oES, dConfig)

        # refresh to make the documents available for search
        oES.indices.refresh(index=dConfig['es-index-name'])

        # and now we can count the documents
        printMsg('func: main()', 'number of documents in', dConfig['es-index-name'], 'index:', oES.count(index=dConfig['es-index-name'])['count'])

        iEnd = time.time()
        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
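# Typical invocations of this indexing script (the script name below is
# hypothetical; substitute the actual file name):
#
#   python indexCorpus.py --corpus-dir-path /data/corpus_8tof --redis --forks 5
#   python indexCorpus.py -c /data/builder_SAN2/RAT -d
#
# Note that iForks is hard-coded back to 1 before the consumer pool is
# created, so --forks only changes the parsed value, not the pool size.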
def processBuildTargets(tTup):

    try:
        (iContainerId, dArgs, dConfig) = tTup

        # dual queues -- primary for getting what project to build next, secondary to mark what is being built
        qRedis = RedisQueue(name=dConfig['redis-queue-to-build'], name2=dConfig['redis-queue-building'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        # set of existing builds for this os container used to prune out projects already built with this container
        sExistingBuilds = RedisSet(name=dConfig['redis-already-built-nate'], namespace='set', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        debug('func: processBuildTargets(), has ' + str(len(sExistingBuilds)) + ' built projects')

        iCtr = 0

        while 1:

            sBuildTarget = qRedis.getnpush(block=True, timeout=30)
            #sBuildTarget = qRedis.peek()
            # debug(sBuildTarget)

            if sBuildTarget:

                if dConfig['debug']:
                    debug('func: processBuildTargets() sBuildTarget:', sBuildTarget)

                dBuildTarget = json.loads(sBuildTarget)

                # initial setup
                #if 'projectName' not in dBuildTarget: continue
                dArgs['projectName'] = dBuildTarget['projectName']

                if dArgs['projectName'] in sExistingBuilds:
                    warning('func: processBuildTargets() project:', dArgs['projectName'], ' already built... skipping...')
                    continue

                #sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
                #sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)
                #dArgs['buildPath'] = sProjectPath

                dArgs['targets'] = dBuildTarget['targets']

                if dConfig['debug']:
                    debug('func: processBuildTargets() targets:', json.dumps(dArgs['targets'], indent=4))

                dArgs['containerId'] = str(iContainerId)
                dArgs['containerName'] = dConfig['containerImage'] + '-' + dArgs['containerOS'] + '-' + dConfig['hostname'] + '_' + str(iContainerId)

                dArgs['dirs'] = {}
                dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'], dArgs['containerName'])

                for sDir in dArgs['containerDirs']:
                    dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'], sDir)

                # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus on all 3 servers (via mount-bind on muse2 and NFS on muse1 and muse3)
                debug('projectPath: ', dBuildTarget['projectPath'])

                if "_8tof" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus_8tof')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber', sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_8tof', sProjectPath)

                if "_0to7" in dBuildTarget['projectPath']:
                    sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus_0to7')
                    sBuildPath = os.path.join('/data/builder_SAN/outputCyber', sProjectPath)
                    sProjectPath = os.path.join('/data/corpus_0to7', sProjectPath)

                debug('projectPathDone: ', sProjectPath)
                dArgs['buildPath'] = sBuildPath

                '''
                # determine code root in project directory
                sCodePath = dBuildTarget['buildTargetPath']

                if sCodePath.startswith('./'):
                    sCodePath = dBuildTarget['buildTargetPath'][2:]

                sCodeRoot = sCodePath[:sCodePath.index(os.sep)] if os.sep in sCodePath else sCodePath
                '''

                # the project tarball is named <uuid>_code.tgz, where <uuid> is the last path component
                plist = sProjectPath.split('/')
                uuid = plist[len(plist) - 1]
                tar = uuid + '_code.tgz'
                debug('tarball: ', tar)

                dArgs['projectPath'] = os.path.join(sProjectPath, tar)

                # add code root to project path
                # if dBuildTarget['codeDir']:
                #     dArgs['projectPath'] = os.path.join(sProjectPath, dBuildTarget['codeDir'])
                # else:
                #     warning('func: processBuildTargets() encountered project:', dBuildTarget['projectName'], ' with empty or NULL codeDir which is not supported. Project build skipped...')
                #     continue

                sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
                dArgs['jsonName'] = 'build-' + sTimeStamp + '.json'
                dArgs['tarName'] = dArgs['projectName'] + '-' + sTimeStamp + '.tgz'
                dArgs['version'] = dBuildTarget['version']

                # setup container
                makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
                copySource(dArgs=dArgs, bDebug=dConfig['debug'])
                copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
                createBuildPlanScript(dArgs=dArgs, bDebug=dConfig['debug'])
                recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])

                startBuild(dArgs=dArgs, bDebug=dConfig['debug'])

                # sleep until build completes
                while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):
                    if dConfig['debug']:
                        debug('func: processBuildTargets() build not completed... sleeping')
                    time.sleep(10)

                # get container logs
                getBuildLogs(dArgs=dArgs, bDebug=dConfig['debug'])

                # get build output
                dBuffer = parseBuildOutput(dArgs=dArgs, bDebug=dConfig['debug'])

                # index build output
                postBuildStatusUpdates(dArgs=dArgs, dBuffer=dBuffer, dConfig=dConfig)

                # archive build artifacts
                tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove container
                removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])

                # remove project from "building" queue
                # qRedis.done(value=sBuildTarget)

                iCtr += 1

                if dConfig['debug'] and iCtr >= 1:
                    break

            else:
                break

        if dConfig['debug']:
            debug('func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty')
            debug('func: processBuildTargets() redis queue size:', qRedis.size())
            debug('func: processBuildTargets() exiting...')

    except Exception as e:
        warning('Caught exception in worker thread:', iContainerId)
        traceback.print_exc()
        raise e
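# pollBuild() is defined elsewhere; the loop above only relies on it returning
# truthy while the build container is still running. A minimal sketch of that
# check, assuming the docker CLI is on PATH and that dArgs['containerName']
# names the build container (the real helper may inspect logs or marker files
# instead):
import subprocess

def pollBuildSketch(dArgs, bDebug=False):
    try:
        sOut = subprocess.check_output(['docker', 'inspect', '-f', '{{.State.Running}}', dArgs['containerName']])
    except subprocess.CalledProcessError:
        return False   # container not found -- treat the build as finished
    return sOut.strip() == b'true'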
def processBuildTargets(tTup):

    (iContainerId, dArgs, dConfig) = tTup

    # dual queues -- primary for getting what project to build next, secondary to mark what is being built
    qRedis = RedisQueue(name=dConfig['redis-queue-to-build'], name2=dConfig['redis-queue-building'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    iCtr = 0

    while 1:

        sBuildTarget = qRedis.getnpush(block=True, timeout=30)
        #sBuildTarget = qRedis.peek()
        # debug(sBuildTarget)

        if sBuildTarget:

            if dConfig['debug']:
                debug('func: processBuildTargets() sBuildTarget:', sBuildTarget)

            dBuildTarget = json.loads(sBuildTarget)

            # initial setup
            sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
            sProjectPath = os.path.join('/nfsbuild/nfsbuild', sProjectPath)

            dArgs['buildPath'] = sProjectPath
            dArgs['buildTargetPath'] = dBuildTarget['buildTargetPath']
            dArgs['buildType'] = dConfig['search-strings'][os.path.basename(dArgs['buildTargetPath'])]

            if dConfig['debug']:
                debug('func: processBuildTargets() dArgs[\'buildType\']:', dArgs['buildType'])

            dArgs['containerId'] = str(iContainerId)
            dArgs['containerName'] = dConfig['containerImage'] + '-' + dArgs['containerOS'] + '-' + dArgs['buildType'] + '-' + dConfig['hostname'] + '_' + str(iContainerId)

            dArgs['dirs'] = {}
            dArgs['dirs']['root'] = os.path.join(dConfig['containerPath'], dArgs['containerName'])

            for sDir in dArgs['containerDirs']:
                dArgs['dirs'][sDir] = os.path.join(dArgs['dirs']['root'], sDir)

            dArgs['projectName'] = dBuildTarget['projectName']

            # /data/corpus on muse2 is mounted under /nfscorpus/nfscorpus on all 3 servers (via mount-bind on muse2 and NFS on muse1 and muse3)
            sProjectPath = os.path.relpath(dBuildTarget['projectPath'], '/data/corpus')
            sProjectPath = os.path.join('/nfscorpus/nfscorpus', sProjectPath)

            dArgs['projectPath'] = os.path.join(sProjectPath, 'latest')

            sTimeStamp = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
            dArgs['tarName'] = dArgs['projectName'] + '-' + sTimeStamp + '.tgz'
            dArgs['version'] = dBuildTarget['version']

            # setup container
            makeDirs(dArgs=dArgs, bDebug=dConfig['debug'])
            copySource(dArgs=dArgs, bDebug=dConfig['debug'])
            copyScripts(dArgs=dArgs, bDebug=dConfig['debug'])
            recordProjectName(dArgs=dArgs, bDebug=dConfig['debug'])

            startBuild(dArgs=dArgs, bDebug=dConfig['debug'])

            # sleep until build completes
            while pollBuild(dArgs=dArgs, bDebug=dConfig['debug']):
                if dConfig['debug']:
                    debug('func: processBuildTargets() build not completed... sleeping')
                time.sleep(10)

            # get build output
            dBuffer = parseBuildOutput(dArgs=dArgs, bDebug=dConfig['debug'])

            # index build output
            postBuildStatusUpdates(dArgs=dArgs, dBuffer=dBuffer, dConfig=dConfig)

            # archive build artifacts
            tarUpContainerDirs(dArgs=dArgs, bDebug=dConfig['debug'])

            # remove container
            removeContainer(dArgs=dArgs, bDebug=dConfig['debug'])

            # remove project from "building" queue
            # qRedis.done(value=sBuildTarget)

            iCtr += 1

            if dConfig['debug'] and iCtr >= 10:
                break

        else:
            break

    if dConfig['debug']:
        debug('func: processBuildTargets() sBuildTarget is either empty or none, likely since the redis queue is empty')
        debug('func: processBuildTargets() redis queue size:', qRedis.size())
        debug('func: processBuildTargets() exiting...')
def queueUpSourceTargets(dConfig):

    if dConfig['mysql'] and dConfig['redis']:

        dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])

        # setup to-build queue
        qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        dMp.open()

        # get projects first to iterate through (makes it easier to build project-specific dictionaries), limit if in debug mode
        iProjectCount = 0
        iTargetCount = 0
        iMultiTargets = 0

        sLimitClause = ''
        if dConfig['debug']:
            sLimitClause = '10'

        lLeadingPaths = []
        dProject = {}
        codedir2 = ''   # default code directory if no leading path component is found
        dCodeDirLookup = {}

        lProjectRows = dMp.select(sSelectClause='projectName,codeDir', sTable='availableProjects', bDebug=dConfig['debug'])

        for tProjectRow in lProjectRows:
            (sProjectName, sCodeDir) = tProjectRow
            dCodeDirLookup[sProjectName] = sCodeDir

        lTargetRows = []

        if dConfig['unBuiltProjectsOnly']:
            if dConfig['queueSite']:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])
            else:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='unBuiltSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])
        else:
            if dConfig['queueSite']:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargetsWithSite', sWhereClause='site=\'' + dConfig['queueSite'] + '\'', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])
            else:
                lTargetRows = dMp.select(sSelectClause='projectName,projectPath,buildTargetPath', sTable='availableSourceTargets', sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        dMp.close()

        for tTargetRow in lTargetRows:

            dTarget = {}
            (sProjectName, sProjectPath, dTarget['buildTargetPath'],) = tTargetRow

            (_, sFileExt) = os.path.splitext(os.path.basename(dTarget['buildTargetPath']))

            if sFileExt:
                sFileExt = sFileExt.lower()

                if sFileExt in dConfig['source-targets'].keys():

                    dTarget['buildType'] = dConfig['source-targets'][sFileExt]

                    (sLeadingPath, sTarget) = os.path.split(dTarget['buildTargetPath'])

                    # remove leading tarball component from the path
                    sLeadingPath = re.sub(r'[a-zA-Z_0-9-_]*\.tgz/', "", sLeadingPath)
                    dTarget['buildTargetPath'] = os.path.join(sLeadingPath, sTarget)

                    # grab code directory from buildTargetPath
                    bPath = sLeadingPath.split('/')
                    if len(bPath) > 1:
                        codedir2 = bPath[0]

                    iTargetCount += 1

                    if 'projectName' in dProject:

                        if dProject['projectName'] != sProjectName:

                            # new project encountered, push old project onto queue
                            if dConfig['debug']:
                                debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

                            qRedis.put(json.dumps(dProject))
                            iProjectCount += 1

                            if len(lLeadingPaths) > 1:
                                iMultiTargets += 1

                            dProject = {
                                'projectName': sProjectName,
                                'projectPath': sProjectPath,
                                'version': dConfig['version'],
                                'targets': [ dTarget ],
                                'codeDir': codedir2
                                #'codeDir': dCodeDirLookup[sProjectName]
                            }

                            lLeadingPaths = [ sLeadingPath ]

                        else:
                            if sLeadingPath not in lLeadingPaths:
                                dProject['targets'].append(dTarget)
                                lLeadingPaths.append(sLeadingPath)
                            else:
                                iTargetCount += -1
                                if dConfig['debug']:
                                    debug('func: queueUpSourceTargets() already encountered path:', sLeadingPath, 'not adding:', json.dumps(dTarget, indent=4))

                    else:
                        dProject = {
                            'projectName': sProjectName,
                            'projectPath': sProjectPath,
                            'version': dConfig['version'],
                            'targets': [ dTarget ],
                            'codeDir': dCodeDirLookup[sProjectName]
                        }

                        lLeadingPaths = [ sLeadingPath ]

                else:
                    warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:', sFileExt, 'file-path:', dTarget['buildTargetPath'], 'for project:', sProjectName)

            else:
                warning('func: queueUpSourceTargets() missing file extension encountered file-path:')
                #, dTarget['buildTargetPath'], 'for project:', sProjectName)

        # queue the final project
        if dConfig['debug']:
            debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

        qRedis.put(json.dumps(dProject))
        iProjectCount += 1

        if len(lLeadingPaths) > 1:
            iMultiTargets += 1

        printMsg('func: queueUpSourceTargets()', str(iProjectCount), 'projects queued', str(iTargetCount), 'targets queued', str(iMultiTargets), 'multi-target projects queued')
        printMsg('func: queueUpSourceTargets()', qRedis.size(), 'projects reported by redis')
def processProjects(dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])
    dMp.open()

    lProjects = []
    iCount = 0

    while 1:

        sRoot = qRedis.get(block=True, timeout=30)

        if sRoot:

            dProject = {
                '_index': dConfig['es-project-index-name'],
                '_type': dConfig['es-project-index-type'],
                '_source': {}
            }

            dProject['_id'] = os.path.basename(sRoot)
            dProject['_source']['name'] = os.path.basename(sRoot)

            if dConfig['debug']:
                debug('func: processProjects() projects-root:', sRoot)
                debug('func: processProjects() projects _id and _source[name]:', dProject['_id'])
                debug('func: processProjects() inserting project:', dProject['_source']['name'])

            if os.path.isfile(os.path.join(sRoot, 'filter.json')):

                with open(os.path.join(sRoot, 'filter.json')) as fProjectFilter:
                    dProjectFilter = json.load(fProjectFilter)

                if 'hasBytecode' in dProjectFilter and dProjectFilter['hasBytecode'].lower() != 'none':
                    dProject['_source']['bytecode_available'] = True

            if os.path.isfile(os.path.join(sRoot, 'index.json')):

                with open(os.path.join(sRoot, 'index.json')) as fProjectIndex:
                    dProjectIndex = json.load(fProjectIndex)

                if dConfig['debug']:
                    debug('func: processProjects() dProjectIndex.keys():', json.dumps(dProjectIndex.keys(), indent=4))

                '''
                if 'bytecode_available' in dProjectIndex and dProjectIndex['bytecode_available']:
                    dProject['_source']['bytecode_available'] = True
                '''

                if 'code' in dProjectIndex:
                    dProject['_source']['source'] = True
                    dProject['_source']['codeDir'] = dProjectIndex['code']

                    if dProject['_source']['codeDir'].startswith('./'):
                        dProject['_source']['codeDir'] = dProject['_source']['codeDir'][len('./'):]

                if 'site' in dProjectIndex:
                    dProject['_source']['site'] = dProjectIndex['site']

                if 'crawler_metadata' in dProjectIndex:

                    for sMetaDataFile in dProjectIndex['crawler_metadata']:

                        if 'languages.json' in sMetaDataFile:

                            sLanguageFile = os.path.join(sRoot, sMetaDataFile)

                            if os.path.isfile(sLanguageFile):

                                with open(sLanguageFile) as fLanguageFile:
                                    dLanguageFile = json.load(fLanguageFile)

                                if 'C' in dLanguageFile:
                                    dProject['_source']['c'] = dLanguageFile['C']
                                if 'C++' in dLanguageFile:
                                    dProject['_source']['cpp'] = dLanguageFile['C++']
                                if 'C#' in dLanguageFile:
                                    dProject['_source']['csharp'] = dLanguageFile['C#']
                                if 'Java' in dLanguageFile:
                                    dProject['_source']['java'] = dLanguageFile['Java']

                                if dConfig['debug']:
                                    debug('func: findProjects() dLanguageFile:', json.dumps(dLanguageFile, indent=4))

                            else:
                                warning('func: processProjects()', 'languages.json file listed in index.json but does not exist for project:', dProject['_source']['name'], 'at listed location:', sLanguageFile)

            else:
                warning('func: processProjects()', 'index.json not found for project:', dProject['_source']['name'])

            lProjects.append(dProject)
            iCount += 1

            if (iCount % dConfig['mysql-bulk-statement-size']) == 0:
                dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
                lProjects = []

            if dConfig['debug'] and iCount >= 100:
                break

        else:
            break

    if dConfig['mysql']:
        if len(lProjects) > 0:
            dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
            lProjects = []

    dMp.close()

    return lProjects
def findSourceTargets(dConfig):

    # setup mysql
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])
    dMp.open()

    # purge source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    lProjectRows = dMp.select(sSelectClause='projectName', sTable='cProjectsWithNoBuildTargets', bDebug=dConfig['debug'])
    dMp.close()

    debug('func: findSourceTargets() # of c projects without build targets:', len(lProjectRows))

    iCtr = 0

    for tProjectRow in lProjectRows:

        iCtr += 1
        if dConfig['debug'] and iCtr > 10:
            break

        (sProjectName, ) = tProjectRow

        # debug('func: findBuildFiles() c project name:', sProjectName)

        '''
        # earlier variant matching on the raw file name instead of the ext field:
        dQuery = {
            "query": {
                "bool": {
                    "must": [
                        { "bool": { "should": [
                            { "regexp": { "file.raw": ".*\.c" } },
                            { "regexp": { "file.raw": ".*\.cxx" } },
                            { "regexp": { "file.raw": ".*\.c++" } },
                            { "regexp": { "file.raw": ".*\.cc" } }
                        ] } },
                        { "bool": { "should": [
                            { "match": { "path": "latest/*" } },
                            { "match": { "path": "content/*" } }
                        ] } },
                        { "term": { "project-name.raw": sProjectName } }
                    ]
                }
            }
        }
        '''

        dQuery = {
            "query": {
                "bool": {
                    "must": [
                        { "bool": { "should": [
                            { "term": { "ext.raw": "c" } },
                            { "term": { "ext.raw": "cpp" } },
                            { "term": { "ext.raw": "cxx" } },
                            { "term": { "ext.raw": "c++" } },
                            { "term": { "ext.raw": "cc" } }
                        ] } },
                        { "bool": { "should": [
                            { "match": { "path": "latest/*" } },
                            { "match": { "path": "content/*" } }
                        ] } },
                        { "term": { "project-name.raw": sProjectName } }
                    ]
                }
            }
        }

        qRedis.put(json.dumps(dQuery))
def main(argv):

    # defaults
    bError = False

    dConfig = {}
    dConfig['debug'] = False
    dConfig['forks'] = 5

    dConfig['mysql-db'] = 'muse'
    dConfig['mysql-user'] = '******'
    dConfig['mysql-passwd'] = 'muse'
    dConfig['mysql-loc'] = 'muse2-int'
    dConfig['mysql-port'] = 54321
    dConfig['mysql'] = True

    dConfig['redis-queue-json'] = 'muse-json'
    dConfig['redis-set'] = 'muse-projects'
    dConfig['redis-loc'] = 'muse2-int'
    # dConfig['redis-port'] = '6379'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = True

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'f:d', ['forks=', 'debug'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:
        if opt in ('-f', '--forks'):
            try:
                dConfig['forks'] = int(arg)
            except ValueError as e:
                bError = True
        elif opt in ('-d', '--debug'):
            dConfig['debug'] = True

    debug('func: main()', 'dConfig:', json.dumps(dConfig, indent=4))

    if bError:
        usage()
    else:
        iStart = time.time()

        # prepare redis queue for producer, flush queue before starting the producer
        qRedis = RedisQueue(dConfig['redis-queue-json'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
        qRedis.flush()

        '''
        # multi-process approach

        # call producer process that populates redis queue with project path roots
        pProducer = multiprocessing.Process(target=createBuildSummaries, args=(dConfig))
        pProducer.start()

        ### setup json writers
        lConsumerArgs = []

        for iCtr in range(0, dConfig['forks']):
            lConsumerArgs.append( (dConfig) )

        # create pool of workers
        oConsumerPool = multiprocessing.Pool(processes=dConfig['forks'])

        ### do work -- use pool of workers to search for each search string in muse-corpus-source es index
        oConsumerPool.map(writeBuildSummaries, lConsumerArgs)

        # wait for the producer to complete
        pProducer.join()

        # wait for the consumer pool to complete
        oConsumerPool.close()
        oConsumerPool.join()
        '''

        # single process approach:
        createBuildSummaries(dConfig)
        writeBuildSummaries(dConfig)

        if dConfig['debug']:
            debug('func: main()', "all processes completed")

        iEnd = time.time()
        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def main(argv):

    # defaults
    bError = False

    dConfig = {}
    dConfig['containerImage'] = 'musebuilder'
    dConfig['containerPath'] = '/data/builder'
    dConfig['debug'] = False
    dConfig['elasticsearch'] = True
    dConfig['es-instance-locs'] = ['muse1-int', 'muse2-int', 'muse3-int']
    #dConfig['es-instance-locs'] = ['muse2-int','muse3-int']
    #dConfig['es-instance-locs'] = ['muse3-int']
    #dConfig['es-file-index-name'] = 'muse-corpus-source'
    dConfig['es-file-index-name'] = 'muse-corpus-build'
    dConfig['es-file-index-type'] = 'muse-project-build'
    dConfig['forks'] = 5
    dConfig['hostname'] = socket.gethostname().replace('.', '')
    dConfig['mysql-db'] = 'muse'
    dConfig['mysql-user'] = '******'
    dConfig['mysql-passwd'] = 'muse'
    dConfig['mysql-loc'] = 'muse2-int'
    dConfig['mysql-port'] = 54321
    dConfig['mysql'] = True
    dConfig['os'] = 'ubuntu14'
    dConfig['redis-queue-to-build'] = 'muse-to-build'
    dConfig['redis-queue-building'] = 'muse-building'
    dConfig['redis-loc'] = 'muse2-int'
    # dConfig['redis-port'] = '6379'
    dConfig['redis-port'] = '12345'
    dConfig['redis'] = True

    dConfig['search-strings'] = {
        'configure': 'configureBuildType',
        'configure.ac': 'configureacBuildType',
        'configure.in': 'configureinBuildType',
        'CMakeLists.txt': 'cmakeBuildType',
        'Makefile': 'makefileBuildType'
        #'build.xml': 'antBuildType',
        #'pom.xml': 'mavenBuildType'
    }

    dArgs = {}
    dArgs['buildScripts'] = {}
    dArgs['buildScripts']['root'] = '/managed/scripts'
    dArgs['buildScripts']['loader'] = os.path.join(dArgs['buildScripts']['root'], 'runBuild.sh')
    dArgs['buildScripts']['cmakeBuildType'] = os.path.join(dArgs['buildScripts']['root'], 'cmake.sh')
    dArgs['buildScripts']['configureBuildType'] = os.path.join(dArgs['buildScripts']['root'], 'configure.sh')
    dArgs['buildScripts']['configureacBuildType'] = os.path.join(dArgs['buildScripts']['root'], 'configureac.sh')
    dArgs['buildScripts']['configureinBuildType'] = os.path.join(dArgs['buildScripts']['root'], 'configurein.sh')
    dArgs['buildScripts']['makefileBuildType'] = os.path.join(dArgs['buildScripts']['root'], 'make.sh')

    dArgs['containerDirs'] = ['buildArtifacts', 'output', 'scripts', 'source']
    dArgs['containerOS'] = 'ubuntu14'
    dArgs['containerPath'] = dConfig['containerPath']
    dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs['containerOS']
    dArgs['script-name'] = 'build.sh'

    lSupportedOSs = ['ubuntu12', 'ubuntu14']

    ### command line argument handling
    options, remainder = getopt.getopt(sys.argv[1:], 'f:q:o:d', ['forks=', 'queue-projects=', 'os=', 'debug'])

    # debug('func: main()', 'options:', options)
    # debug('func: main()', 'remainder:', remainder)

    for opt, arg in options:
        if opt in ('-f', '--forks'):
            try:
                dConfig['forks'] = int(arg)
            except ValueError as e:
                bError = True
        elif opt in ('-o', '--os'):
            if arg in lSupportedOSs:
                dArgs['containerOS'] = arg
                dArgs['imageName'] = dConfig['containerImage'] + '-' + dArgs['containerOS']
            else:
                bError = True
        elif opt in ('-d', '--debug'):
            dConfig['debug'] = True

    if bError:
        usage()
    else:
        # pre-initialization -- if projects remained in building queue, put them back in queue-to-build
        qToBuildRedis = RedisQueue(name=dConfig['redis-queue-building'], name2=dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        for iCtr in range(0, len(qToBuildRedis)):
            qToBuildRedis.getnpush()

        iStart = time.time()

        ### setup consumers
        lConsumerArgs = []

        # create a locking semaphore for mutex
        lock = multiprocessing.Lock()

        for iCtr in range(0, dConfig['forks']):
            lConsumerArgs.append((iCtr, dArgs, dConfig))

        # create pool of workers -- number of workers equals the number of search strings to be processed
        oConsumerPool = multiprocessing.Pool(processes=dConfig['forks'], initializer=initialize_lock, initargs=(lock, ))

        ### do work -- use pool of workers to search for each search string in muse-corpus-source es index
        oConsumerPool.map(processBuildTargets, lConsumerArgs)

        oConsumerPool.close()
        oConsumerPool.join()

        #processBuildTargets( (0, dArgs, dConfig) )

        if dConfig['debug']:
            debug('func: main()', "all processes completed")

        iEnd = time.time()
        printMsg('func: main()', 'execution time:', (iEnd - iStart), 'seconds')
def createBuildSummaries(dConfig):

    qRedis = RedisQueue(dConfig['redis-queue-json'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])

    sLimitClause = ''
    if dConfig['debug']:
        sLimitClause = '10'

    dReturnCodeLookup = {
        'buildSuccess': 'success',
        'buildPartial': 'partial',
        'buildFail': 'fail'
    }

    sSelectClause = 'projectName,projectPath,buildTarPath,buildTime,version,os,numObjectsPreBuild,numObjectsPostBuild,numObjectsGenerated,numSources,buildTargetPath,configureBuildType,configureacBuildType,configureinBuildType,cmakeBuildType,makefileBuildType,antBuildType,mavenBuildType,returnCode'

    lTargetTypes = ['configureBuildType', 'configureacBuildType', 'configureinBuildType', 'cmakeBuildType', 'makefileBuildType', 'antBuildType', 'mavenBuildType']

    dMp.open()

    iProjectCount = 0

    dProjects = {
        'success': RedisSet(dConfig['redis-set'] + '-success', namespace='set', host=dConfig['redis-loc'], port=dConfig['redis-port']),
        'partial': RedisSet(dConfig['redis-set'] + '-partial', namespace='set', host=dConfig['redis-loc'], port=dConfig['redis-port']),
        'fail': RedisSet(dConfig['redis-set'] + '-fail', namespace='set', host=dConfig['redis-loc'], port=dConfig['redis-port'])
    }

    for sTable, sProjectBin in dReturnCodeLookup.iteritems():

        # empty redis set
        dProjects[sProjectBin].flush()

        lProjects = dMp.select(sSelectClause='projectName', sTable=sTable, sOrderByClause='projectName', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

        # populate redis set with projects of each bin type
        for tProject in lProjects:
            (sProjectName, ) = tProject
            dProjects[sProjectBin].put(sProjectName)

    dProjectSummary = {}

    lTargetRows = dMp.select(sSelectClause=sSelectClause, sTable='buildStatusWithTargets', sOrderByClause='projectName,buildTarPath', sLimitClause=sLimitClause, bDebug=dConfig['debug'])

    for tTargetRow in lTargetRows:

        dTarget = {}

        (dTarget['projectName'], dTarget['projectPath'], dTarget['buildTarPath'], dTarget['buildTime'], dTarget['version'], dTarget['os'],
         dTarget['numObjectsPreBuild'], dTarget['numObjectsPostBuild'], dTarget['numObjectsGenerated'], dTarget['numSources'],
         dTarget['buildTargetPath'], dTarget['configureBuildType'], dTarget['configureacBuildType'], dTarget['configureinBuildType'],
         dTarget['cmakeBuildType'], dTarget['makefileBuildType'], dTarget['antBuildType'], dTarget['mavenBuildType'], dTarget['returnCode']) = tTargetRow

        if dProjectSummary:

            if dProjectSummary['projectName'] == dTarget['projectName']:

                # find the existing build entry for this tarball, or create one
                try:
                    dBuild = (dBuild for dBuild in dProjectSummary['builds'] if dBuild['buildTarPath'] == dTarget['buildTarPath']).next()
                except StopIteration as e:
                    dBuild = {
                        'buildTarPath': dTarget['buildTarPath'],
                        'buildTime': dTarget['buildTime'],
                        'version': dTarget['version'],
                        'os': dTarget['os'],
                        'numObjectsPreBuild': dTarget['numObjectsPreBuild'],
                        'numObjectsPostBuild': dTarget['numObjectsPostBuild'],
                        'numObjectsGenerated': dTarget['numObjectsGenerated'],
                        'numSources': dTarget['numSources'],
                        'targets': []
                    }
                    dProjectSummary['builds'].append(dBuild)

                dTargetSummary = {
                    'buildTargetPath': dTarget['buildTargetPath'],
                    'returnCode': dTarget['returnCode']
                }

                for sTargetType in lTargetTypes:
                    if dTarget[sTargetType] == 1:
                        dTargetSummary['target-type'] = sTargetType
                        break

                dBuild['targets'].append(dTargetSummary)

            else:
                if dConfig['debug']:
                    debug('func: createBuildSummaries() dProjectSummary:', json.dumps(dProjectSummary, indent=4))

                qRedis.put(json.dumps(dProjectSummary))
                iProjectCount += 1
                dProjectSummary = {}

        if not dProjectSummary:

            # project specific build summary info
            dBuild = {
                'buildTarPath': dTarget['buildTarPath'],
                'buildTime': dTarget['buildTime'],
                'version': dTarget['version'],
                'os': dTarget['os'],
                'numObjectsPreBuild': dTarget['numObjectsPreBuild'],
                'numObjectsPostBuild': dTarget['numObjectsPostBuild'],
                'numObjectsGenerated': dTarget['numObjectsGenerated'],
                'numSources': dTarget['numSources'],
                'targets': []
            }

            dProjectSummary = {
                'projectName': dTarget['projectName'],
                'sourcePath': dTarget['projectPath'],
                'builds': [dBuild]
            }

            if dTarget['projectName'] in dProjects['success']:
                dProjectSummary['buildStatus'] = 'success'
            elif dTarget['projectName'] in dProjects['partial']:
                dProjectSummary['buildStatus'] = 'partial'
            elif dTarget['projectName'] in dProjects['fail']:
                dProjectSummary['buildStatus'] = 'fail'

            # target specific build summary info
            dTargetSummary = {
                'buildTargetPath': dTarget['buildTargetPath'],
                'returnCode': dTarget['returnCode']
            }

            for sTargetType in lTargetTypes:
                if dTarget[sTargetType] == 1:
                    dTargetSummary['target-type'] = sTargetType
                    break

            dBuild['targets'].append(dTargetSummary)

    # queue the final project summary
    if dProjectSummary:

        if dConfig['debug']:
            debug('func: createBuildSummaries() dProjectSummary:', json.dumps(dProjectSummary, indent=4))

        qRedis.put(json.dumps(dProjectSummary))
        iProjectCount += 1
        dProjectSummary = {}

    dMp.close()

    printMsg('func: createBuildSummaries()', str(iProjectCount), 'projects queued')
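# For reference, a build summary queued by createBuildSummaries() and consumed
# by writeBuildSummaries() has this shape (field values are illustrative, not
# real data):
#
# {
#     "projectName": "exampleProject",
#     "sourcePath": "/data/corpus_8tof/.../exampleProject",
#     "buildStatus": "partial",
#     "builds": [
#         {
#             "buildTarPath": "...", "buildTime": "...", "version": "...", "os": "ubuntu14",
#             "numObjectsPreBuild": 0, "numObjectsPostBuild": 12, "numObjectsGenerated": 12, "numSources": 40,
#             "targets": [
#                 { "buildTargetPath": "Makefile", "returnCode": 2, "target-type": "makefileBuildType" }
#             ]
#         }
#     ]
# }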
def indexSourceTargets(dConfig):

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'], port=dConfig['mysql-port'], user=dConfig['mysql-user'], passwd=dConfig['mysql-passwd'], loc=dConfig['mysql-loc'])
    dMp.open()

    # setup elasticsearch client
    oES = Elasticsearch(dConfig['es-instance-locs'])

    # setup source targets queue
    qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    while 1:

        sQuery = qRedis.get(block=True, timeout=30)

        if sQuery:

            dQuery = json.loads(sQuery)

            if dConfig['debug']:
                debug('func: indexSourceTargets() dQuery:', json.dumps(dQuery))

            lSourceFiles = []

            # scroll time set to 20 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time
            dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False)

            sScrollId = dResponse['_scroll_id']

            if dConfig['debug']:
                debug('func: indexSourceTargets() (after initial search) dResponse: ', dResponse)
                debug('func: indexSourceTargets() search hits: ', dResponse['hits']['total'])

            #while not dResponse['timed_out'] and dResponse['hits']['hits']['total'] > 0:
            while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

                dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')
                sScrollId = dResponse['_scroll_id']

                if ('hits' in dResponse['hits']) and (len(dResponse['hits']['hits']) > 0):

                    if dConfig['debug']:
                        debug('func: indexSourceTargets() scroll_id:', sScrollId, 'number of hits:', len(dResponse['hits']['hits']))

                    for dHit in dResponse['hits']['hits']:

                        # found matches
                        try:
                            if '_source' in dHit:

                                # debug('func: indexSourceTargets() dHit:', json.dumps(dHit['_source']) )

                                # strip the leading path from found build targets
                                mBuildTarget = dHit['_source']['file']
                                mBuildTarget = mBuildTarget.split('/')
                                dHit['_source']['file'] = mBuildTarget[len(mBuildTarget) - 1]

                                dProjectFound = {}
                                lSourceTypes = dMp.getSourceTypes()

                                for sSourceType in lSourceTypes:
                                    dProjectFound[sSourceType] = False

                                if 'file' in dHit['_source'] and dHit['_source']['file']:
                                    (sFileName, sFileExt) = os.path.splitext(dHit['_source']['file'])

                                    if sFileExt.lower() in dConfig['source-targets'].keys():
                                        dProjectFound[dConfig['source-targets'][sFileExt.lower()]] = True
                                    else:
                                        warning('func indexSourceTargets() es returned an improper source target:', json.dumps(dHit['_source']))
                                        continue

                                if 'project-name' in dHit['_source'] and dHit['_source']['project-name']:
                                    dProjectFound['projectName'] = dHit['_source']['project-name']

                                if 'project-path' in dHit['_source'] and dHit['_source']['project-path']:
                                    dProjectFound['projectPath'] = dHit['_source']['project-path']

                                if 'path' in dHit['_source'] and dHit['_source']['path']:
                                    dProjectFound['buildTargetPath'] = verifyEncoding(dHit['_source']['path'])

                                # debug('func findSourceFileHelper()', json.dumps(dProjectFound))

                                lSourceFiles.append(dProjectFound)

                                # flush in batches -- large buffers were causing es reads to time out
                                if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:
                                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                                    printMsg('func indexSourceTargets() loaded', len(lSourceFiles), 'source targets')
                                    lSourceFiles = []

                        except (UnicodeDecodeError, UnicodeEncodeError) as e:
                            warning('func indexSourceTargets() encountered exception:', e)
                            #warning('func indexSourceTargets() with string: ', dHit['_source']['path'])
                            warning('func indexSourceTargets() full _source payload: ', json.dumps(dHit['_source'], indent=4))

                else:
                    break

            if (len(lSourceFiles) > 0) and dConfig['mysql']:
                dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                lSourceFiles = []

        else:
            break

    dMp.close()
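# The manual scan/scroll loop above predates elasticsearch-py's helpers
# module. A sketch of the same traversal using helpers.scan(), which manages
# scroll ids and expiry internally -- shown as an alternative, not what this
# code runs; the doc_type and scroll arguments assume the same legacy ES
# version used above:
from elasticsearch import Elasticsearch, helpers

def iterSourceTargets(oES, dQuery, dConfig):
    # yields each matching document's _source dict
    for dHit in helpers.scan(oES, query=dQuery, index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], scroll='20m'):
        yield dHit['_source']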