def _queueSourceTargetProject(dConfig, qRedis, dProject):
    """Push one grouped project onto the to-build queue as a JSON string."""

    if dConfig['debug']: debug('func: queueUpSourceTargets() queuing project:', json.dumps(dProject, indent=4))

    qRedis.put(json.dumps(dProject))


def queueUpSourceTargets(dConfig):
    """Group source targets by project and queue each project for building.

    Reads (projectName, projectPath, buildTargetPath) rows from mysql, keeps
    at most one target per distinct leading directory within a project, and
    puts one JSON document per project onto the redis queue named by
    dConfig['redis-queue-to-build'].  Rows are requested ordered by
    projectName; the grouping relies on a project's rows being adjacent.

    Does nothing unless both dConfig['mysql'] and dConfig['redis'] are truthy.

    dConfig -- configuration dict; keys read here are 'mysql', 'redis', the
               'mysql-*' and 'redis-*' connection settings, 'debug',
               'unBuiltProjectsOnly', 'queueSite', 'source-targets'
               (lower-cased file extension -> build type) and 'version'.

    Returns None.
    """

    if not (dConfig['mysql'] and dConfig['redis']):
        return

    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])

    # setup to-build queue
    qRedis = RedisQueue(dConfig['redis-queue-to-build'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    # limit the number of target rows fetched when in debug mode
    sLimitClause = '10' if dConfig['debug'] else ''

    # pick the table to read: un-built vs. all available targets, optionally restricted to one site
    sTable = 'unBuiltSourceTargets' if dConfig['unBuiltProjectsOnly'] else 'availableSourceTargets'
    dSelectArgs = {
        'sSelectClause': 'projectName,projectPath,buildTargetPath',
        'sOrderByClause': 'projectName',
        'sLimitClause': sLimitClause,
        'bDebug': dConfig['debug'],
    }

    if dConfig['queueSite']:
        sTable += 'WithSite'
        # NOTE(review): the site value is concatenated into the where clause unescaped; it must come from trusted configuration
        dSelectArgs['sWhereClause'] = 'site=\'' + dConfig['queueSite'] + '\''

    # projectName -> codeDir as recorded in mysql, used when the path yields no code directory
    dCodeDirLookup = {}

    dMp.open()
    try:
        for (sProjectName, sCodeDir) in dMp.select(sSelectClause='projectName,codeDir', sTable='availableProjects', bDebug=dConfig['debug']):
            dCodeDirLookup[sProjectName] = sCodeDir

        lTargetRows = dMp.select(sTable=sTable, **dSelectArgs)
    finally:
        # close the connection even when a select raises
        dMp.close()

    iProjectCount = 0
    iTargetCount = 0
    iMultiTargets = 0

    # project currently being assembled and the leading paths already seen for it
    dProject = {}
    lLeadingPaths = []

    # compiled once instead of per row; pattern text is unchanged
    # NOTE(review): the '.' before 'tgz' is unescaped and so matches any character -- confirm whether '\.' was intended
    reLeadingTarball = re.compile(r'[a-zA-Z_0-9-_]*.tgz/')

    for (sProjectName, sProjectPath, sBuildTargetPath) in lTargetRows:

        (_, sFileExt) = os.path.splitext( os.path.basename(sBuildTargetPath) )

        if not sFileExt:
            warning('func: queueUpSourceTargets() missing file extension encountered file-path:') #,dTarget['buildTargetPath'],'for project:', sProjectName)
            continue

        sFileExt = sFileExt.lower()

        if sFileExt not in dConfig['source-targets']:
            warning('func: queueUpSourceTargets() unknown C/C++ file extension encountered:', sFileExt, 'file-path:',sBuildTargetPath,'for project:', sProjectName)
            continue

        (sLeadingPath, sTarget) = os.path.split(sBuildTargetPath)

        # NATE remove leading tarball from path
        sLeadingPath = reLeadingTarball.sub("", sLeadingPath)

        dTarget = {
            'buildTargetPath': os.path.join(sLeadingPath, sTarget),
            'buildType': dConfig['source-targets'][sFileExt]
        }

        if dProject and dProject['projectName'] == sProjectName:

            # same project as the previous row: keep only one target per leading path
            if sLeadingPath in lLeadingPaths:
                if dConfig['debug']: debug('func: queueUpSourceTargets() already encountered path:',  sLeadingPath, 'not adding:', json.dumps(dTarget, indent=4))
            else:
                dProject['targets'].append(dTarget)
                lLeadingPaths.append(sLeadingPath)
                iTargetCount += 1

            continue

        # new project encountered, push old project (if any) onto queue
        if dProject:
            _queueSourceTargetProject(dConfig, qRedis, dProject)
            iProjectCount += 1
            if len(lLeadingPaths) > 1:
                iMultiTargets += 1

        # NATE added to grab code directory from buildTargetPath.
        # Recomputed for every new project so a value can never leak over
        # from the previous project; falls back to the mysql codeDir when the
        # path has fewer than two directory components.
        lPathParts = sLeadingPath.split('/')
        if len(lPathParts) > 1:
            sCodeDir = lPathParts[0]
        else:
            sCodeDir = dCodeDirLookup.get(sProjectName)
            if sCodeDir is None:
                warning('func: queueUpSourceTargets() no code directory found for project:', sProjectName)

        dProject = {
            'projectName': sProjectName,
            'projectPath': sProjectPath,
            'version': dConfig['version'],
            'targets': [ dTarget ],
            'codeDir': sCodeDir
        }

        lLeadingPaths = [ sLeadingPath ]
        iTargetCount += 1

    # push the last project; skipped when no row produced a project so that an empty document is never queued
    if dProject:
        _queueSourceTargetProject(dConfig, qRedis, dProject)
        iProjectCount += 1
        if len(lLeadingPaths) > 1:
            iMultiTargets += 1

    printMsg('func: queueUpSourceTargets()', str(iProjectCount), 'projects queued', str(iTargetCount), 'targets queued', str(iMultiTargets), 'multi-target projects queued')
    printMsg('func: queueUpSourceTargets()', qRedis.size(), 'projects reported by redis')
def _applyProjectFilter(sRoot, dSource):
    """Set dSource['bytecode_available'] from <sRoot>/filter.json when that file reports bytecode."""

    sFilterFile = os.path.join(sRoot, 'filter.json')

    if not os.path.isfile(sFilterFile):
        return

    with open(sFilterFile) as fProjectFilter:
        dProjectFilter = json.load(fProjectFilter)

    if 'hasBytecode' in dProjectFilter and dProjectFilter['hasBytecode'].lower() != 'none':
        dSource['bytecode_available'] = True


def _applyProjectLanguages(dConfig, sRoot, sMetaDataFile, dSource):
    """Copy the C, C++, C# and Java entries of a languages.json file into dSource."""

    sLanguageFile = os.path.join(sRoot, sMetaDataFile)

    if not os.path.isfile(sLanguageFile):
        warning('func: processProjects()', 'languages.json file listed in index.json but does not exist for project:', dSource['name'], 'at listed location:', sLanguageFile)
        return

    with open(sLanguageFile) as fLanguageFile:
        dLanguageFile = json.load(fLanguageFile)

    # key in languages.json -> field name stored on the project document
    for (sLanguage, sField) in (('C', 'c'), ('C++', 'cpp'), ('C#', 'csharp'), ('Java', 'java')):
        if sLanguage in dLanguageFile:
            dSource[sField] = dLanguageFile[sLanguage]

    if dConfig['debug']: debug('func: findProjects() dLanguageFile:', json.dumps(dLanguageFile, indent=4) )


def _applyProjectIndex(dConfig, sRoot, dSource):
    """Fill dSource with code directory, site and language data read from <sRoot>/index.json."""

    sIndexFile = os.path.join(sRoot, 'index.json')

    if not os.path.isfile(sIndexFile):
        warning('func: processProjects()', 'index.json not found for project:', dSource['name'])
        return

    with open(sIndexFile) as fProjectIndex:
        dProjectIndex = json.load(fProjectIndex)

    # list(...) because a dict keys view is not JSON serializable on Python 3
    if dConfig['debug']: debug('func: processProjects() dProjectIndex.keys():', json.dumps(list(dProjectIndex.keys()), indent=4) )

    if 'code' in dProjectIndex:

        sCodeDir = dProjectIndex['code']

        # store the code directory without a leading './'
        if sCodeDir.startswith('./'):
            sCodeDir = sCodeDir[len('./'):]

        dSource['source'] = True
        dSource['codeDir'] = sCodeDir

    if 'site' in dProjectIndex:
        dSource['site'] = dProjectIndex['site']

    if 'crawler_metadata' in dProjectIndex:

        for sMetaDataFile in dProjectIndex['crawler_metadata']:

            if 'languages.json' in sMetaDataFile:
                _applyProjectLanguages(dConfig, sRoot, sMetaDataFile, dSource)


def processProjects(dConfig):
    """Drain the project-paths queue and load one project record per path into mysql.

    Each queue item is treated as a project root directory.  Its base name
    becomes the record's '_id' and '_source.name'; optional filter.json,
    index.json and languages.json files under the root add further
    '_source' fields.  Records are inserted in batches of
    dConfig['mysql-bulk-statement-size'] when dConfig['mysql'] is truthy.
    The loop ends when the queue yields nothing within 30 seconds, or after
    100 projects in debug mode.

    dConfig -- configuration dict; keys read here are the 'redis-*' and
               'mysql-*' connection settings, 'mysql',
               'mysql-bulk-statement-size', 'es-project-index-name',
               'es-project-index-type' and 'debug'.

    Returns the list of records that were not inserted: empty when
    dConfig['mysql'] is truthy, otherwise every record built.
    """

    qRedis = RedisQueue(dConfig['redis-queue-project-paths'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    lProjects = []

    iCount = 0

    try:

        while True:

            sRoot = qRedis.get(block=True, timeout=30)

            # nothing arrived within the timeout: treat the queue as drained
            if not sRoot:
                break

            sProjectName = os.path.basename(sRoot)

            dProject = {
                '_index': dConfig['es-project-index-name'],
                '_type': dConfig['es-project-index-type'],
                '_source': { 'name': sProjectName }
            }

            dProject['_id'] = sProjectName

            debug('func: processProjects() projects-root:', sRoot)

            if dConfig['debug']:

                debug('func: processProjects() projects _id and _source[name] :', dProject['_id'])
                debug('func: processProjects() inserting project:', dProject['_source']['name'])

            _applyProjectFilter(sRoot, dProject['_source'])
            _applyProjectIndex(dConfig, sRoot, dProject['_source'])

            lProjects.append(dProject)

            iCount += 1

            # bulk insert honours the mysql switch, same as the final flush below
            if dConfig['mysql'] and (iCount % dConfig['mysql-bulk-statement-size']) == 0:

                dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
                lProjects = []

            if dConfig['debug'] and iCount >= 100: break

        if dConfig['mysql'] and len(lProjects) > 0:

            dMp.insertIntoProjects(lProjects=lProjects, bDebug=dConfig['debug'])
            lProjects = []

    finally:

        # the connection is opened unconditionally above, so always release it
        dMp.close()

    return lProjects
def _buildSourceTargetQuery(sProjectName):
    """Build the elasticsearch query body selecting C/C++ source files of one project."""

    # file extensions (without the dot) that count as C/C++ sources
    lExtensionTerms = [ { "term": { "ext.raw": sExt } } for sExt in ("c", "cpp", "cxx", "c++", "cc") ]

    return {
        "query": {
            "bool": {
                "must": [
                    { "bool": { "should": lExtensionTerms } },
                    {
                      "bool": {
                        "should": [
                          { "match": { "path": "latest/*" } },
                          { "match": { "path": "content/*"} }
                        ]
                      }
                    },
                    {
                      "term": { "project-name.raw": sProjectName }
                    }
                ]
            }
        }
    }


def findSourceTargets(dConfig):
    """Queue one elasticsearch query per C project that has no build targets.

    Project names are read from the mysql table
    'cProjectsWithNoBuildTargets'.  For each one, a JSON-encoded query
    matching its .c/.cpp/.cxx/.c++/.cc files under 'latest/' or 'content/'
    is put onto the redis queue named by
    dConfig['redis-queue-source-targets'].  In debug mode only the first 10
    projects are queued.

    dConfig -- configuration dict; keys read here are the 'mysql-*' and
               'redis-*' connection settings and 'debug'.

    Returns None.
    """

    # setup mysql
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    try:

        # setup source targets queue (existing queue contents are not purged here)
        qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        lProjectRows = dMp.select(sSelectClause='projectName', sTable='cProjectsWithNoBuildTargets', bDebug=dConfig['debug'])

    finally:

        # close the connection even when the select raises
        dMp.close()

    debug('func: findSourceTargets() # of c projects without build targets:', len(lProjectRows) )

    for (iCtr, tProjectRow) in enumerate(lProjectRows, 1):

        if dConfig['debug'] and iCtr > 10: break

        (sProjectName, ) = tProjectRow

        qRedis.put( json.dumps( _buildSourceTargetQuery(sProjectName) ) )
def _describeBuildTarget(dConfig, lBuildTypes, dSource):
    """Turn one elasticsearch _source payload into a build-target record.

    dSource['file'] is reduced in place to its last '/'-separated component.
    Returns the record dict, or None (after logging a warning) when the file
    name is missing, empty, or not a key of dConfig['build-targets'].
    """

    # NATE added, remove leading path from found build targets
    sFile = dSource.get('file')
    if sFile:
        sFile = sFile.split('/')[-1]
        dSource['file'] = sFile

    if not sFile or sFile not in dConfig['build-targets']:

        warning( 'func findBuildFiles() es returned an improper build target:', json.dumps(dSource) )
        return None

    if dConfig['debug']: debug('func findBuildFiles() returned build target:', sFile)

    dBuildTarget = dConfig['build-targets'][sFile]

    # initialize all build target types to false, then mark the relevant one true
    dProjectFound = dict.fromkeys(lBuildTypes, False)
    dProjectFound[ dBuildTarget['type'] ] = True
    dProjectFound['ranking'] = dBuildTarget['ranking']

    if 'project-name' in dSource and dSource['project-name']: dProjectFound['projectName'] = dSource['project-name']
    if 'project-path' in dSource and dSource['project-path']: dProjectFound['projectPath'] = dSource['project-path']
    if 'path' in dSource and dSource['path']:

        dProjectFound['buildTargetPath'] = verifyEncoding( dSource['path'] )
        dProjectFound['depth'] = depth( dProjectFound['buildTargetPath'] )

    return dProjectFound


def findBuildTargets(dConfig):
    """Scan elasticsearch for build files and load them into mysql as build targets.

    Runs one scan/scroll search for configure.ac, configure.in, configure,
    CMakeLists.txt and Makefile entries, converts every hit whose file name
    is a key of dConfig['build-targets'] into a record, and inserts the
    records through dMp.insertIntoBuildTargets when dConfig['mysql'] is
    truthy.  Records are flushed whenever more than
    dConfig['mysql-bulk-statement-size'] have accumulated and after every
    scroll page.  In debug mode processing stops after 10 hits.

    dConfig -- configuration dict; keys read here are the 'mysql-*'
               connection settings, 'mysql', 'mysql-bulk-statement-size',
               'es-instance-locs', 'es-file-index-name',
               'es-file-index-type', 'build-targets' and 'debug'.

    Returns None.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    try:

        # setup elasticsearch client
        oES = Elasticsearch(dConfig['es-instance-locs'],timeout=180, max_retries=3, retry_on_timeout=True )

        # considering if we need to split mysql ingestion from elasticsearch queries... mysql may benefit from consumer pool inserting statements concurrently

        # fetched once instead of once per hit; assumes the build types do not change during a run
        lBuildTypes = list(dMp.getBuildTypes())

        lBuildFiles = []

        iCtr = 0

        dQuery = {
            "query": {
                "bool": {
                    "must": [
                        { "bool": {
                            "should": [
                                { "wildcard": { "file.raw": "*/configure.ac" } },
                                { "wildcard": { "file.raw": "*/configure.in" } },
                                { "wildcard": { "file.raw": "*/configure" } },
                                { "wildcard": { "file.raw": "*/CMakeLists.txt" } },
                                { "wildcard": { "file.raw": "*/Makefile" } }
                            ]
                          }
                        },
                        {
                          "bool": {
                            "should": [
                              { "match": { "path": "latest/*" } },
                              { "match": { "path": "content/*"} }
                            ]
                          }
                        },
                        {"wildcard":{"file.raw": "/data/corpus_8tof/*"}}
                    ]
                }
            }
        }

        if dConfig['debug']: debug( 'func: findBuildFiles() dQuery:', json.dumps(dQuery) )

        # scroll time set to 20 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

        dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False, request_timeout=180,)
        sScrollId = dResponse['_scroll_id']

        if dConfig['debug']: debug('func: findBuildFiles() (after initial search) dResponse: ', dResponse)

        debug('func: findBuildFiles() search hits: ', dResponse['hits']['total'])

        while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

            dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

            sScrollId = dResponse['_scroll_id']

            lHits = dResponse['hits'].get('hits')

            # an empty page means the scroll is exhausted
            if not lHits:
                break

            if dConfig['debug']: debug('func: findBuildFiles() scroll_id:', sScrollId, 'number of hits:', len(lHits) )

            for dHit in lHits:

                iCtr += 1

                if dConfig['debug'] and iCtr > 10: break

                # found matches

                try:

                    if '_source' in dHit:

                        dProjectFound = _describeBuildTarget(dConfig, lBuildTypes, dHit['_source'])

                        if dProjectFound is None:
                            continue

                        lBuildFiles.append(dProjectFound)

                        # causing es reads to time out

                        if (len(lBuildFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                            dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])
                            printMsg('func findBuildFiles() loaded', iCtr, 'build targets')

                            lBuildFiles = []

                except (UnicodeDecodeError, UnicodeEncodeError) as e:

                    warning('func findBuildFiles() encountered exception:', e)
                    warning('func findBuildFiles() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

            # flush whatever this page produced
            if (len(lBuildFiles) > 0) and dConfig['mysql']:

                dMp.insertIntoBuildTargets(lTargets=lBuildFiles, bDebug=dConfig['debug'])

                lBuildFiles = []

            # debug limit reached: stop here rather than requesting another scroll page first
            if dConfig['debug'] and iCtr > 10: break

    finally:

        # close the connection even when elasticsearch or an insert raises
        dMp.close()
def _describeSourceTarget(dConfig, lSourceTypes, dSource):
    """Turn one elasticsearch _source payload into a source-target record.

    dSource['file'] is reduced in place to its last '/'-separated component.
    Returns the record dict, or None (after logging a warning) when the file
    name is missing or empty.
    """

    # NATE added, remove leading path from found source targets
    sFile = dSource.get('file')
    if sFile:
        sFile = sFile.split('/')[-1]
        dSource['file'] = sFile

    if not sFile:

        warning( 'func indexSourceTargets() es returned an improper source target:', json.dumps(dSource) )
        return None

    # initialize all source types to false, then mark the one matching the file extension (if any)
    dProjectFound = dict.fromkeys(lSourceTypes, False)

    (_, sFileExt) = os.path.splitext(sFile)
    sFileExt = sFileExt.lower()

    if sFileExt in dConfig['source-targets']:

        dProjectFound[ dConfig['source-targets'][sFileExt] ] = True

    if 'project-name' in dSource and dSource['project-name']: dProjectFound['projectName'] = dSource['project-name']
    if 'project-path' in dSource and dSource['project-path']: dProjectFound['projectPath'] = dSource['project-path']
    if 'path' in dSource and dSource['path']:

        dProjectFound['buildTargetPath'] = verifyEncoding( dSource['path'] )

    return dProjectFound


def indexSourceTargets(dConfig):
    """Run queued elasticsearch queries and load the matching files into mysql as source targets.

    Takes JSON-encoded query bodies from the redis queue named by
    dConfig['redis-queue-source-targets'], runs each as a scan/scroll search,
    converts every hit that carries a file name into a record, and inserts
    the records through dMp.insertIntoSourceTargets when dConfig['mysql'] is
    truthy.  Records are flushed whenever more than
    dConfig['mysql-bulk-statement-size'] have accumulated and after every
    scroll page.  The loop ends when the queue yields nothing within 30
    seconds.

    dConfig -- configuration dict; keys read here are the 'mysql-*' and
               'redis-*' connection settings, 'mysql',
               'mysql-bulk-statement-size', 'es-instance-locs',
               'es-file-index-name', 'es-file-index-type', 'source-targets'
               and 'debug'.

    Returns None.
    """

    # setup mysql client
    dMp = MuseProjectDB(db=dConfig['mysql-db'],port=dConfig['mysql-port'],user=dConfig['mysql-user'],passwd=dConfig['mysql-passwd'],loc=dConfig['mysql-loc'])
    dMp.open()

    try:

        # setup elasticsearch client
        oES = Elasticsearch(dConfig['es-instance-locs'])

        # setup source targets queue
        qRedis = RedisQueue(dConfig['redis-queue-source-targets'], namespace='queue', host=dConfig['redis-loc'], port=dConfig['redis-port'])

        # fetched once instead of once per hit; assumes the source types do not change during a run
        lSourceTypes = list(dMp.getSourceTypes())

        # number of hits seen so far across all queries, reported by the progress message below
        iCtr = 0

        while True:

            sQuery = qRedis.get(block=True, timeout=30)

            # nothing arrived within the timeout: treat the queue as drained
            if not sQuery:
                break

            dQuery = json.loads(sQuery)

            if dConfig['debug']: debug( 'func: indexSourceTargets() dQuery:', json.dumps(dQuery) )

            lSourceFiles = []

            # scroll time set to 20 minutes, change as needed -- required for consistent results, the scroll token expires at the end of scroll time

            dResponse = oES.search(index=dConfig['es-file-index-name'], doc_type=dConfig['es-file-index-type'], body=json.dumps(dQuery), search_type='scan', scroll='20m', timeout='20m', lowercase_expanded_terms=False)
            sScrollId = dResponse['_scroll_id']

            if dConfig['debug']: debug('func: indexSourceTargets() (after initial search) dResponse: ', dResponse)

            if dConfig['debug']: debug('func: indexSourceTargets() search hits: ', dResponse['hits']['total'])

            while 'timed_out' in dResponse and not dResponse['timed_out'] and 'hits' in dResponse and 'total' in dResponse['hits'] and dResponse['hits']['total'] > 0:

                dResponse = oES.scroll(scroll_id=sScrollId, scroll='20m')

                sScrollId = dResponse['_scroll_id']

                lHits = dResponse['hits'].get('hits')

                # an empty page means the scroll is exhausted
                if not lHits:
                    break

                if dConfig['debug']: debug('func: indexSourceTargets() scroll_id:', sScrollId, 'number of hits:', len(lHits) )

                for dHit in lHits:

                    iCtr += 1

                    # found matches

                    try:

                        if '_source' in dHit:

                            dProjectFound = _describeSourceTarget(dConfig, lSourceTypes, dHit['_source'])

                            if dProjectFound is None:
                                continue

                            lSourceFiles.append(dProjectFound)

                            # causing es reads to time out

                            if (len(lSourceFiles) > dConfig['mysql-bulk-statement-size']) and dConfig['mysql']:

                                dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])
                                printMsg('func indexSourceTargets() loaded', iCtr, 'source targets')

                                lSourceFiles = []

                    except (UnicodeDecodeError, UnicodeEncodeError) as e:

                        warning('func indexSourceTargets() encountered exception:', e)
                        warning('func indexSourceTargets() full _source payload: ', json.dumps( dHit['_source'], indent=4 ) )

                # flush whatever this page produced
                if (len(lSourceFiles) > 0) and dConfig['mysql']:

                    dMp.insertIntoSourceTargets(lTargets=lSourceFiles, bDebug=dConfig['debug'])

                    lSourceFiles = []

    finally:

        # close the connection even when elasticsearch or an insert raises
        dMp.close()