Example #1
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes      = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime    = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR       = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes      = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs    = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad     = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        return
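For context, a minimal sketch of the configuration object this constructor reads. The attribute names are taken from the getattr calls above; the Configuration class and the component_/section_ layout follow the usual WMCore pattern, and every value here is illustrative:

    from WMCore.Configuration import Configuration

    config = Configuration()
    config.component_("ErrorHandler")
    config.ErrorHandler.maxRetries = {'default': 3}      # may also be a plain int
    config.ErrorHandler.maxProcessSize = 250             # jobs handled per cycle
    config.ErrorHandler.failureExitCodes = []
    config.ErrorHandler.maxFailTime = 32 * 3600          # seconds
    config.ErrorHandler.readFWJR = False
    config.ErrorHandler.passExitCodes = []
    config.section_("ACDC")
    config.ACDC.couchurl = "http://localhost:5984"       # hypothetical CouchDB URL
    config.ACDC.database = "acdcserver"                  # hypothetical database name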
Example #2
    def _getDBSBlock(self, match, wmspec):
        """Get DBS info for this block"""
        blockName = match['Inputs'].keys()[0] #TODO: Allow more than one

        if match['ACDC']:
            acdcInfo = match['ACDC']
            acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
            collection = acdc.getDataCollection(acdcInfo['collection'])
            splitedBlockName = ACDCBlock.splitBlockName(blockName)
            fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                           acdcInfo['fileset'],
                                           splitedBlockName['Offset'],
                                           splitedBlockName['NumOfFiles'],
                                           user = wmspec.getOwner().get("name"),
                                           group = wmspec.getOwner().get("group"))
            block = {}
            block["Files"] = fileLists
            return blockName, block
        else:
            dbs = get_dbs(match['Dbs'])
            if wmspec.getTask(match['TaskName']).parentProcessingFlag():
                dbsBlockDict = dbs.getFileBlockWithParents(blockName)
            else:
                dbsBlockDict = dbs.getFileBlock(blockName)
        return blockName, dbsBlockDict[blockName]
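As a rough guide, the keys read from ACDCBlock.splitBlockName in these examples imply a result of this shape; the field list is inferred purely from usage here and in Example #6, so other keys may exist (compare the concrete block name in Example #13):

    parts = ACDCBlock.splitBlockName(blockName)
    offset = parts['Offset']        # chunk offset within the ACDC fileset
    nFiles = parts['NumOfFiles']    # number of files in the chunk
    task = parts['TaskName']        # task path encoded in the block name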
Example #3
    def setUp(self):
        """
        setup for test.
        """
        super(ErrorHandlerTest, self).setUp()
        myThread = threading.currentThread()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS"],
                                useDefault=False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daofactory(classname="Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname="Jobs.SetStateTime")
        locationAction = self.daofactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet", pnn="T2_CH_CERN")
        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url=self.testInit.couchUrl,
                                            database="errorhandler_t")

        return
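A matching tearDown is not shown in this excerpt. A plausible counterpart, assuming the usual TestInitCouchApp/EmulatorSetup helper methods, would be:

    def tearDown(self):
        """
        Clean up everything created in setUp.
        """
        super(ErrorHandlerTest, self).tearDown()
        self.testInit.clearDatabase()        # drop the WMBS schema
        self.testInit.tearDownCouch()        # delete the couch databases
        self.testInit.delWorkDir()           # remove the generated work dir
        EmulatorSetup.deleteConfig(self.configFile)
        return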
Example #4
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into a number of lumis per job
        Allow a flag to determine if we split files between jobs
        """

        avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
        eventLimit      = int(kwargs.get('max_events_per_lumi', 20000))
        splitOnFile     = bool(kwargs.get('halt_job_on_file_boundaries', True))
        ignoreACDC      = bool(kwargs.get('ignore_acdc_except', False))
        collectionName  = kwargs.get('collectionName', None)
        splitOnRun      = kwargs.get('splitOnRun', True)
        getParents      = kwargs.get('include_parents', False)
        runWhitelist    = kwargs.get('runWhitelist', [])
        runs            = kwargs.get('runs', None)
        lumis           = kwargs.get('lumis', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))
        deterministicPileup = kwargs.get('deterministicPileup', False)
        eventsPerLumiInDataset = 0

        if deterministicPileup and self.package == 'WMCore.WMBS':
            getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
            jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id)
            self.nJobs = jobNumber

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL       = kwargs.get('couchURL')
                couchDB        = kwargs.get('couchDB')
                filesetName    = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner          = kwargs.get('owner')
                group          = kwargs.get('group')

                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
            except Exception as ex:
                msg =  "Exception while trying to load goodRunList\n"
                if ignoreACDC:
                    msg +=  "Ditching goodRunList\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    goodRunList = {}
                else:
                    msg +=  "Refusing to create any jobs.\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    return
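The goodRunList built by buildLumiMask above is a run-indexed lumi mask. A hedged sketch of its shape, inferred from the parallel runs/lumis arguments (values are illustrative):

    runs = [180899, 180900]
    lumis = ['20,30,40,50', '10,15']    # "begin,end" pairs per run
    goodRunList = buildLumiMask(runs, lumis)
    # roughly: {'180899': [[20, 30], [40, 50]], '180900': [[10, 15]]}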
Example #5
    def _getDBSBlock(self, match, wmspec):
        """Get DBS info for this block"""
        blockName = match['Inputs'].keys()[0] #TODO: Allow more than one

        if match['ACDC']:
            acdcInfo = match['ACDC']
            acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
            collection = acdc.getDataCollection(acdcInfo['collection'])
            splitedBlockName = ACDCBlock.splitBlockName(blockName)
            fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                           acdcInfo['fileset'],
                                           splitedBlockName['Offset'],
                                           splitedBlockName['NumOfFiles'],
                                           user = wmspec.getOwner().get("name"),
                                           group = wmspec.getOwner().get("group"))
            block = {}
            block["Files"] = fileLists
            return blockName, block
        else:
            dbs = get_dbs(match['Dbs'])
            if wmspec.getTask(match['TaskName']).parentProcessingFlag():
                dbsBlockDict = dbs.getFileBlockWithParents(blockName)
            else:
                dbsBlockDict = dbs.getFileBlock(blockName)

            if wmspec.locationDataSourceFlag():
                blockInfo = dbsBlockDict[blockName]
                seElements = []
                for cmsSite in match['Inputs'].values()[0]: #TODO: Allow more than one
                    ses = self.SiteDB.cmsNametoSE(cmsSite)
                    seElements.extend(ses)
                seElements = list(set(seElements))
                blockInfo['StorageElements'] = seElements
        return blockName, dbsBlockDict[blockName]
Example #6
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction"""
        validBlocks = []
        # TODO take the chunk size from parameter
        chunkSize = 200

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(
                acdcInfo['collection'],
                acdcBlockSplit['TaskName'],
                acdcBlockSplit['Offset'],
                acdcBlockSplit['NumOfFiles'],
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            acdcBlocks = acdc.chunkFileset(
                acdcInfo['collection'],
                acdcInfo['fileset'],
                chunkSize,
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            for block in acdcBlocks:
                dbsBlock = {}
                dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                                  acdcInfo["fileset"],
                                                  block['offset'],
                                                  block['files'])
                dbsBlock['NumberOfFiles'] = block['files']
                dbsBlock['NumberOfEvents'] = block['events']
                dbsBlock['NumberOfLumis'] = block['lumis']
                dbsBlock["Sites"] = sitesFromStorageEelements(
                    block["locations"])
                dbsBlock['ACDC'] = acdcInfo
                validBlocks.append(dbsBlock)

        return validBlocks
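For reference, the fields read from each chunk returned by getChunkInfo and chunkFileset imply a record of roughly this shape; the field list is inferred only from the usage above and the values are invented:

    block = {'offset': 0,                        # position of the chunk in the fileset
             'files': 200,                       # files in the chunk
             'events': 1500000,
             'lumis': 320,
             'locations': ['srm.example.org']}   # storage elements, mapped to sites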
Example #7
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into a number of lumis per job
        Allow a flag to determine if we split files between jobs
        """

        myThread = threading.currentThread()

        lumisPerJob = int(kwargs.get('lumis_per_job', 1))
        splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
        ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
        collectionName = kwargs.get('collectionName', None)
        splitOnRun = kwargs.get('splitOnRun', True)
        getParents = kwargs.get('include_parents', False)
        runWhitelist = kwargs.get('runWhitelist', [])
        runs = kwargs.get('runs', None)
        lumis = kwargs.get('lumis', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner = kwargs.get('owner')
                group = kwargs.get('group')

                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                if ignoreACDC:
                    msg += "Ditching goodRunList\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    goodRunList = {}
                else:
                    msg += "Refusing to create any jobs.\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    return
Example #8
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into a number of lumis per job
        Allow a flag to determine if we split files between jobs
        """

        myThread = threading.currentThread()

        lumisPerJob = int(kwargs.get('lumis_per_job', 1))
        splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
        ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
        collectionName = kwargs.get('collectionName', None)
        splitOnRun = kwargs.get('splitOnRun', True)
        getParents = kwargs.get('include_parents', False)
        runWhitelist = kwargs.get('runWhitelist', [])
        runs = kwargs.get('runs', None)
        lumis = kwargs.get('lumis', None)

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner = kwargs.get('owner')
                group = kwargs.get('group')

                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(collectionName, filesetName,
                                                   owner, group)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                if ignoreACDC:
                    msg += "Ditching goodRunList\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    goodRunList = {}
                else:
                    msg += "Refusing to create any jobs.\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    return
Example #9
    def testFailedJobsUniqueWf(self):
        """
        Performance test of failedJobs with all failed jobs belonging
        to the same workflow and the same task name
        """
        loadList = []
        for i in range(1, 5000):
            loadList.append(self.jobConfig('wf1', '/wf1/task1', i, 'lfn1'))

        dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc")
        dcs.failedJobs(loadList)
        return
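The jobConfig helper is not part of this excerpt. A hypothetical stand-in, modelled on the Job/File construction in Example #11 (site name, owner, and file attributes are invented), might look like:

    from WMCore.DataStructs.File import File
    from WMCore.DataStructs.Job import Job
    from WMCore.DataStructs.Run import Run

    def jobConfig(self, wf, task, jobid, lfn):
        """Build a minimal failed-job record for DataCollectionService.failedJobs()."""
        job = Job(str(jobid))
        job['workflow'] = wf
        job['task'] = task
        job['location'] = 'T1_US_FNAL'    # assumed site name
        job['owner'] = 'integration'      # assumed owner/group
        job['group'] = 'DMWM'
        aFile = File(lfn=lfn, size=1024, events=100)
        aFile.setLocation(['T1_US_FNAL'])
        aFile.addRun(Run(1, 1, 2))        # run 1, lumis 1-2
        job.addFile(aFile)
        return job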
Example #10
    def testFailedJobsUniqueWf(self):
        """
        Performance test of failedJobs with all failed jobs belonging
        to the same workflow and the same task name
        """
        loadList = []
        for i in range(1, 5000):
            loadList.append(self.jobConfig('wf1', '/wf1/task1', i, 'lfn1'))

        dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc")
        dcs.failedJobs(loadList)
        return
Example #11
    def createResubmitSpec(self, serverUrl, couchDB):
        """
        _createResubmitSpec_
        Create a bogus resubmit workload.
        """
        self.site = "cmssrm.fnal.gov"
        workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
        reco = workload.newTask("reco")
        workload.setOwnerDetails(name = "evansde77", group = "DMWM")

        # first task uses the input dataset
        reco.addInputDataset(primary = "PRIMARY", processed = "processed-v1", tier = "TIER1")
        reco.data.input.splitting.algorithm = "File"
        reco.setTaskType("Processing")
        cmsRunReco = reco.makeStep("cmsRun1")
        cmsRunReco.setStepType("CMSSW")
        reco.applyTemplates()
        cmsRunRecoHelper = cmsRunReco.getTypeHelper()
        cmsRunRecoHelper.addOutputModule("outputRECO",
                                        primaryDataset = "PRIMARY",
                                        processedDataset = "processed-v2",
                                        dataTier = "TIER2",
                                        lfnBase = "/store/dunkindonuts",
                                        mergedLFNBase = "/store/kfc")
        
        dcs = DataCollectionService(url = serverUrl, database = couchDB)

        def getJob(workload):
            job = Job()
            job["task"] = workload.getTask("reco").getPathName()
            job["workflow"] = workload.name()
            job["location"] = self.site
            job["owner"] = "evansde77"
            job["group"] = "DMWM"
            return job

        testFileA = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
        testFileA.setLocation([self.site])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
        testFileB.setLocation([self.site])
        testFileB.addRun(Run(1, 3, 4))
        testJobA = getJob(workload)
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        
        dcs.failedJobs([testJobA])
        topLevelTask = workload.getTopLevelTask()[0]
        workload.truncate("Resubmit_TestWorkload", topLevelTask.getPathName(), 
                          serverUrl, couchDB)
                                  
        return workload
Example #12
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction"""
        validBlocks = []
        # TODO take the chunk size from parameter
        chunkSize = 200

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(acdcInfo['collection'],
                                      acdcBlockSplit['TaskName'],
                                      acdcBlockSplit['Offset'],
                                      acdcBlockSplit['NumOfFiles'],
                                      user = self.wmspec.getOwner().get("name"),
                                      group = self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            acdcBlocks = acdc.chunkFileset(acdcInfo['collection'],
                                           acdcInfo['fileset'],
                                           chunkSize,
                                           user = self.wmspec.getOwner().get("name"),
                                           group = self.wmspec.getOwner().get("group"))
            for block in acdcBlocks:
                dbsBlock = {}
                dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                                  acdcInfo["fileset"],
                                                  block['offset'], block['files'])
                dbsBlock['NumberOfFiles'] = block['files']
                dbsBlock['NumberOfEvents'] = block['events']
                dbsBlock['NumberOfLumis'] = block['lumis']
                dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
                dbsBlock['ACDC'] = acdcInfo
                validBlocks.append(dbsBlock)

        return validBlocks
Example #13
def main():
    start = time.time()
    # blockName = match['Inputs'].keys()[0]
    blockName = "/acdc/vlimant_ACDC0_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_190218_145226_481/:pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222:SUS-RunIIFall18wmLHEGS-00025_0/0/31055"

    # acdcInfo = match['ACDC']
    acdcInfo = {"database": "acdcserver",
                "fileset": "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
                "collection": "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
                "server": "https://cmsweb.cern.ch/couchdb"}

    acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    splitedBlockName = ACDCBlock.splitBlockName(blockName)
    print("Splitted block name: %s" % splitedBlockName)

    fileLists = acdc.getChunkFiles(acdcInfo['collection'],
                                   acdcInfo['fileset'],
                                   splitedBlockName['Offset'],
                                   splitedBlockName['NumOfFiles'])
    print("Retrieved %d unique files from the ACDCServer" % len(fileLists))

    block = {}
    block["Files"] = fileLists

    wantedLumis = set([252052, 240646])
    for f in fileLists:
        for run in f['runs']:
            maskDict = run.json()
            lumisSet = set(maskDict['Lumis'].keys())
            if wantedLumis.intersection(lumisSet):
                print("File: %s with events: %s, contains these lumis: %s" % (f['lfn'], f['events'], wantedLumis.intersection(lumisSet)))

    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)

    ### Now doing the WMBSHelper stuff
    reqUrl = "https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache"
    requestName = "vlimant_ACDC0_task_HIG-RunIIFall17wmLHEGS-01122__v1_T_180808_130708_5376"

    wmspec = WMWorkloadHelper()
    wmspec.loadSpecFromCouch(reqUrl, requestName)
    taskName = "HIG-RunIIFall17DRPremix-00788_0"
    mask = None
    cacheDir = "/data/srv/wmagent/v1.1.14.patch6/install/wmagent/WorkQueueManager/cache"
    # wmbsHelper = WMBSHelper(wmspec, match['TaskName'], blockName, mask, self.params['CacheDir'])
    wmbsHelper = WMBSHelper(wmspec, taskName, blockName, mask, cacheDir)
    sub, numFilesAdded = wmbsHelper.createSubscriptionAndAddFiles(block=block)
Example #14
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(
                acdcInfo['collection'],
                acdcBlockSplit['TaskName'],
                acdcBlockSplit['Offset'],
                acdcBlockSplit['NumOfFiles'],
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.getTrustSitelists():
                dbsBlock["Sites"] = self.sites
            else:
                # TODO remove this line when all DBS origin_site_name is converted to PNN
                block["locations"] = self.siteDB.checkAndConvertSENameToPNN(
                    block["locations"])
                # upto this
                dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'ACDC is not supported for %s' %
                    self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Example #15
    def testFailedJobsScrambledWf(self):
        """
        Performance test of failedJobs where jobs belong to 10 different
        workflows and 3 different tasks
        """
        loadList = []
        for i in range(1, 5000):
            wfName = "wf%d" % (i % 10)
            taskName = "/wf%d/task%d" % (i % 10, i % 3)
            loadList.append(self.jobConfig(wfName, taskName, i, '/file/name/lfn1'))

        dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc")
        dcs.failedJobs(loadList)
        return
Example #16
    def testFailedJobsScrambledWf(self):
        """
        Performance test of failedJobs where jobs belong to 10 different
        workflows and 3 different tasks
        """
        loadList = []
        for i in range(1, 5000):
            wfName = "wf%d" % (i % 10)
            taskName = "/wf%d/task%d" % (i % 10, i % 3)
            loadList.append(self.jobConfig(wfName, taskName, i, '/file/name/lfn1'))

        dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc")
        dcs.failedJobs(loadList)
        return
Example #17
def main():
    start = time.time()

    # acdcInfo = match['ACDC']
    acdcInfo = {
        "database": "acdcserver",
        "fileset":
        "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
        "collection":
        "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
        "server": "https://cmsweb.cern.ch/couchdb"
    }

    dcs = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
    #    acdcFileList = dcs.getProductionACDCInfo(acdcInfo['collection'], acdcInfo['fileset'])

    files = dcs._getFilesetInfo(acdcInfo['collection'], acdcInfo['fileset'])
    print("%s" % pformat(files[0]))
    files = mergeFilesInfo(files)
    acdcFileList = []
    for value in files:
        fileInfo = {
            "lfn": value["lfn"],
            "first_event": value["first_event"],
            "lumis": value["runs"][0]["lumis"],
            "events": value["events"]
        }
        acdcFileList.append(fileInfo)
    #print("Data retrieved:\n%s" % pformat(acdcFileList))
    print("Retrieved %d files from the ACDCServer" % len(acdcFileList))

    listLumis = []
    wantedLumis = set([252052, 240646])
    for f in acdcFileList:
        listLumis.extend(f['lumis'])
        lumisSet = set(f['lumis'])
        if wantedLumis.intersection(lumisSet):
            print("File: %s with events: %s, contains these lumis: %s" %
                  (f['lfn'], f['events'], f['lumis']))

    print("Total amount of lumis: %d, where unique are: %d" %
          (len(listLumis), len(set(listLumis))))
    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)
Example #18
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will then be available
        self.initAlerts(compName = "ErrorHandler")        
        
        return
Example #19
    def setUp(self):
        """
        setup for test.
        """
        myThread = threading.currentThread()
        
        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase = True)
        self.testInit.setSchema(customModules = ["WMCore.WMBS"],
                                useDefault = False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.getJobs = self.daofactory(classname = "Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname = "Jobs.SetStateTime")
        locationAction = self.daofactory(classname = "Locations.New")
        locationAction.execute(siteName = "malpaquet", seName = "malpaquet")
        self.testDir = self.testInit.generateWorkDir()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url = self.testInit.couchUrl,
                                            database = "errorhandler_t")
        
        return
Example #20
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {"default": self.maxRetries}
        if "default" not in self.maxRetries:
            raise ErrorHandlerException("Max retries for the default job type must be specified")

        self.maxProcessSize = getattr(self.config.ErrorHandler, "maxProcessSize", 250)
        self.exitCodes = getattr(self.config.ErrorHandler, "failureExitCodes", [])
        self.maxFailTime = getattr(self.config.ErrorHandler, "maxFailTime", 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, "readFWJR", False)
        self.passCodes = getattr(self.config.ErrorHandler, "passExitCodes", [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl, database=config.ACDC.database)

        return
Example #21
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(acdcInfo['collection'],
                                      acdcBlockSplit['TaskName'],
                                      acdcBlockSplit['Offset'],
                                      acdcBlockSplit['NumOfFiles'],
                                      user = self.wmspec.getOwner().get("name"),
                                      group = self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.inputLocationFlag():
                dbsBlock["Sites"] = self.sites
            else:
                #TODO remove this line when all DBS origin_site_name is converted to PNN
                block["locations"] = self.siteDB.checkAndConvertSENameToPNN(block["locations"])
                #upto this
                dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(self.wmspec, 'ACDC is not supported for %s' % self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Example #22
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes',
                                 [])
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will then be available
        self.initAlerts(compName="ErrorHandler")

        # Some exit codes imply an immediate failure, non-configurable
        self.exitCodes.extend(WMJobPermanentSystemErrors)

        return
Example #23
    def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None):
        """
        This is not implemented yet
        :return:
        """
        from WMCore.ACDC.DataCollectionService import DataCollectionService

        goodRunList = None
        try:
            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumilistWhitelist(collectionName, filesetName)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % ex.__str__()
            logging.exception(msg)

        return goodRunList
Example #24
    def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None):
        """
        This is not implemented yet
        :return:
        """
        from WMCore.ACDC.DataCollectionService import DataCollectionService

        goodRunList = None
        try:
            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumilistWhitelist(collectionName, filesetName)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList. "
            msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
            logging.exception(msg)

        return goodRunList
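A hedged usage sketch (the URL, database, and fileset/collection names are placeholders, not values from the source):

    goodRunList = lumiListFromACDC(couchURL="https://cmsweb.cern.ch/couchdb",
                                   couchDB="acdcserver",
                                   filesetName="/MyWorkflow/Task1",
                                   collectionName="MyWorkflow")
    if goodRunList is None:
        logging.error("No lumi mask retrieved; refusing to create jobs")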
Example #25
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(next(iter(self.data)))
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = next(iter(self.data))
            block = acdc.getChunkInfo(acdcInfo['collection'],
                                      acdcBlockSplit['TaskName'],
                                      acdcBlockSplit['Offset'],
                                      acdcBlockSplit['NumOfFiles'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.getTrustSitelists().get('trustlists'):
                dbsBlock["Sites"] = self.sites
            else:
                dbsBlock["Sites"] = self.cric.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'ACDC is not supported for %s' %
                    self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Example #26
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the data is the input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(acdcInfo['collection'],
                                      acdcBlockSplit['TaskName'],
                                      acdcBlockSplit['Offset'],
                                      acdcBlockSplit['NumOfFiles'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.getTrustSitelists().get('trustlists'):
                dbsBlock["Sites"] = self.sites
            else:
                dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(self.wmspec, 'ACDC is not supported for %s' % self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Example #27
def main():
    start = time.time()

    # acdcInfo = match['ACDC']
    acdcInfo = {"database": "acdcserver",
                "fileset": "/pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222/SUS-RunIIFall18wmLHEGS-00025_0",
                "collection": "pdmvserv_task_SUS-RunIIFall18wmLHEGS-00025__v1_T_181211_005112_2222",
                "server": "https://cmsweb.cern.ch/couchdb"}

    dcs = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
#    acdcFileList = dcs.getProductionACDCInfo(acdcInfo['collection'], acdcInfo['fileset'])

    files = dcs._getFilesetInfo(acdcInfo['collection'], acdcInfo['fileset'])
    print("%s" % pformat(files[0]))
    files = mergeFilesInfo(files)
    acdcFileList = []
    for value in files:
        fileInfo = {"lfn": value["lfn"],
                    "first_event": value["first_event"],
                    "lumis": value["runs"][0]["lumis"],
                    "events": value["events"]}
        acdcFileList.append(fileInfo)
    #print("Data retrieved:\n%s" % pformat(acdcFileList))
    print("Retrieved %d files from the ACDCServer" % len(acdcFileList))

    listLumis = []
    wantedLumis = set([252052, 240646])
    for f in acdcFileList:
        listLumis.extend(f['lumis'])
        lumisSet = set(f['lumis'])
        if wantedLumis.intersection(lumisSet):
            print("File: %s with events: %s, contains these lumis: %s" % (f['lfn'], f['events'], f['lumis']))

    print("Total amount of lumis: %d, where unique are: %d" % (len(listLumis), len(set(listLumis))))
    # with open("chunkfiles.json", 'w') as fo:
    #     json.dump(block, fo)

    end = time.time()
    print("Spent %s secs running so far" % (end - start))
    sys.exit(1)
Example #28
    def test06UploadACDC(self):
        # get previous request we can piggyback on
        for request in reversed(self.__class__.reqmgr.getRequest()):
            request = request['WMCore.RequestManager.DataStructs.Request.Request']['RequestName']
            if 'RequestCancellation_t' in request:
                self.__class__.requestParams['OriginalRequestName'] = request
                break
        else:
            raise nose.SkipTest("no suitable request in reqmgr to resubmit")

        self.__class__.requestParams['InitialTaskPath'] = self.__class__.requestParams['InitialTaskPath'] % self.__class__.requestParams['OriginalRequestName']
        self.__class__.requestParams['ACDCServer'] = self.__class__.endpoint + '/couchdb'

        # create and upload acdc
        service = DataCollectionService(url=self.__class__.endpoint + '/couchdb', database = 'wmagent_acdc')
        service.createCollection(self.__class__.requestParams['OriginalRequestName'], 'integration', 'DMWM')
        with open(os.path.join(getTestBase(), '..', 'data', 'ACDC', 'linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735.json')) as infile:
            acdc_json = infile.read().replace('linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735',
                                              self.__class__.requestParams['OriginalRequestName'])
        acdc_json = loads(acdc_json)
        acdc_database = Database('wmagent_acdc', self.__class__.endpoint + '/couchdb')
        acdc_database.commit(acdc_json)
Example #29
    def lumiListFromACDC(couchURL=None, couchDB=None, filesetName=None, collectionName=None, ignoreACDC=False):
        """
        This is not implemented yet
        :return:
        """

        try:
            logging.info('Creating jobs for ACDC fileset %s', filesetName)
            dcs = DataCollectionService(couchURL, couchDB)
            goodRunList = dcs.getLumilistWhitelist(collectionName, filesetName)
        except Exception as ex:
            msg = "Exception while trying to load goodRunList\n"
            if ignoreACDC:  # Logic can go in main function?
                msg += "Ditching goodRunList\n" + str(ex) + str(traceback.format_exc())
                logging.error(msg)
                goodRunList = {}
            else:
                msg += "Refusing to create any jobs.\n" + str(ex) + str(traceback.format_exc())
                logging.error(msg)
                return None  # An error condition - check

        return goodRunList
Example #30
    def test06UploadACDC(self):
        # get previous request we can piggyback on
        for request in reversed(self.__class__.reqmgr.getRequest()):
            request = request[
                'WMCore.RequestManager.DataStructs.Request.Request'][
                    'RequestName']
            if 'RequestCancellation_t' in request:
                self.__class__.requestParams['OriginalRequestName'] = request
                break
        else:
            raise nose.SkipTest("no suitable request in reqmgr to resubmit")

        self.__class__.requestParams[
            'InitialTaskPath'] = self.__class__.requestParams[
                'InitialTaskPath'] % self.__class__.requestParams[
                    'OriginalRequestName']
        self.__class__.requestParams[
            'ACDCServer'] = self.__class__.endpoint + '/couchdb'

        # create and upload acdc
        service = DataCollectionService(url=self.__class__.endpoint +
                                        '/couchdb',
                                        database='wmagent_acdc')
        service.createCollection(
            self.__class__.requestParams['OriginalRequestName'], 'integration',
            'DMWM')
        with open(
                os.path.join(
                    getTestBase(), '..', 'data', 'ACDC',
                    'linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735.json'
                )) as infile:
            acdc_json = infile.read().replace(
                'linacre_ACDC_ReReco13JulCosmics_120809_130020_117_120823_200309_5735',
                self.__class__.requestParams['OriginalRequestName'])
        acdc_json = loads(acdc_json)
        acdc_database = Database('wmagent_acdc',
                                 self.__class__.endpoint + '/couchdb')
        acdc_database.commit(acdc_json)
Example #31
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        self.setupComponentParam()

        return
Example #32
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        An event-based splitting algorithm.  All available files are split into a
        set number of events per job.
        """
        eventsPerJob = int(kwargs.get("events_per_job", 100))
        eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
        getParents = kwargs.get("include_parents", False)
        lheInput = kwargs.get("lheInputFiles", False)
        collectionName = kwargs.get('collectionName', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))
        acdcFileList = []

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner = kwargs.get('owner')
                group = kwargs.get('group')
                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                acdcFileList = dcs.getProductionACDCInfo(
                    collectionName, filesetName, owner, group)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
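Judging by how the same data is assembled by hand in Example #17, each entry returned by getProductionACDCInfo is roughly of this form (keys inferred from that usage, values invented):

    fileInfo = {'lfn': '/store/mc/SomeFile.root',   # hypothetical LFN
                'first_event': 1,
                'lumis': [252052, 240646],          # lumi sections to re-run
                'events': 5000}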
Example #33
    def _getDBSBlock(self, match, wmspec):
        """Get DBS info for this block"""
        blockName = match['Inputs'].keys()[0]  #TODO: Allow more than one

        if match['ACDC']:
            acdcInfo = match['ACDC']
            acdc = DataCollectionService(acdcInfo["server"],
                                         acdcInfo["database"])
            collection = acdc.getDataCollection(acdcInfo['collection'])
            splitedBlockName = ACDCBlock.splitBlockName(blockName)
            fileLists = acdc.getChunkFiles(
                acdcInfo['collection'],
                acdcInfo['fileset'],
                splitedBlockName['Offset'],
                splitedBlockName['NumOfFiles'],
                user=wmspec.getOwner().get("name"),
                group=wmspec.getOwner().get("group"))
            block = {}
            block["Files"] = fileLists
            return blockName, block
        else:
            dbs = get_dbs(match['Dbs'])
            if wmspec.getTask(match['TaskName']).parentProcessingFlag():
                dbsBlockDict = dbs.getFileBlockWithParents(blockName)
            else:
                dbsBlockDict = dbs.getFileBlock(blockName)

            if wmspec.locationDataSourceFlag():
                blockInfo = dbsBlockDict[blockName]
                seElements = []
                for cmsSite in match['Inputs'].values(
                )[0]:  #TODO: Allow more than one
                    ses = self.SiteDB.cmsNametoSE(cmsSite)
                    seElements.extend(ses)
                seElements = list(set(seElements))
                blockInfo['StorageElements'] = seElements
        return blockName, dbsBlockDict[blockName]
Example #34
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        An event-based splitting algorithm.  All available files are split into a
        set number of events per job.
        """
        eventsPerJob = int(kwargs.get("events_per_job", 100))
        eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
        getParents   = kwargs.get("include_parents", False)
        lheInput = kwargs.get("lheInputFiles", False)
        collectionName  = kwargs.get('collectionName', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))
        acdcFileList = []

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL       = kwargs.get('couchURL')
                couchDB        = kwargs.get('couchDB')
                filesetName    = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner          = kwargs.get('owner')
                group          = kwargs.get('group')
                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName, owner, group)
            except Exception as ex:
                msg =  "Exception while trying to load goodRunList\n"
                msg +=  "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return
Example #35
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes      = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime    = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR       = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes      = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs    = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad     = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName = "ErrorHandler")

        # Some exit codes imply an immediate failure, non-configurable
        self.exitCodes.extend(WMJobPermanentSystemErrors)

        return
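
A small standalone sketch of the maxRetries normalisation used by these constructors: a plain integer is promoted to a per-job-type dict keyed on 'default', and a dict without a 'default' entry is rejected:

def normaliseMaxRetries(maxRetries):
    """Promote a scalar retry count to a per-job-type dict."""
    if not isinstance(maxRetries, dict):
        maxRetries = {'default': maxRetries}
    if 'default' not in maxRetries:
        raise ValueError('Max retries for the default job type must be specified')
    return maxRetries

print(normaliseMaxRetries(3))                           # {'default': 3}
print(normaliseMaxRetries({'default': 3, 'Merge': 5}))  # passes through unchanged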
Example No. 36
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
            self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries")

        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        return
Example No. 37
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        self.setupComponentParam()

        return
Example No. 38
0
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant
    Run through ProcessPool
    """
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn_to_psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later commital
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing the
        framework job report return None.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999,
                                           "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://", "")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99999,
                'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg = "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997,
                                           'Cannot load jobReport')

        if len(jobReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % jobReportPath)
            return self.createMissingFWKJR(
                parameters, 99997,
                'jobReport with no steps: %s ' % jobReportPath)

        return jobReport
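
Condensed, the guard ladder in loadJobReport reduces to a path check that strips an optional file:// prefix before touching the filesystem. A standalone sketch (error strings shortened):

import os

def fwjrProblem(jobReportPath):
    """Return a description of what is wrong with the FWJR path, or None if usable."""
    if not jobReportPath:
        return "FWJR path is empty"
    jobReportPath = jobReportPath.replace("file://", "")
    if not os.path.exists(jobReportPath):
        return "Cannot find file in jobReport path: %s" % jobReportPath
    if os.path.getsize(jobReportPath) == 0:
        return "jobReport of size 0: %s" % jobReportPath
    return None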

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to
        recover it getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning(
                "Trying to recover a corrupted FWJR for a %s job with job id %s"
                % (jobStatus, jobReport.getJobID()))
            jobInfo = self.getJobTaskNameAction.execute(
                jobId=jobReport.getJobID(),
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (
                    jobStatus, jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info(
                    "TaskName '%s' successfully recovered and added to fwjr id %s."
                    % (jobReport.getTaskName(), jobReport.getJobID()))

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the job
        ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s" % job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job)
            fwkJobReport.setJobID(job['id'])

            jobSuccess = self.handleJob(jobID=job["id"],
                                        fwkJobReport=fwkJobReport)

            if self.returnJobReport:
                returnList.append({
                    'id': job["id"],
                    'jobSuccess': jobSuccess,
                    'jobReport': fwkJobReport
                })
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)

        # handle merge files separately since parentage need to set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild,
                             self.parentageBindsForMerge)

        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if len(self.filesetAssoc) > 0:
            self.bulkAddToFilesetAction.execute(
                binds=self.filesetAssoc,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Move successful jobs to successful
        if len(self.listOfJobsToSave) > 0:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.jobCompleteInput.execute(
                id=idList,
                lfnsToSkip=self.jobsWithSkippedFiles,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success",
                                        "complete")

        # If we have failed jobs, fail them
        if len(self.listOfJobsToFail) > 0:
            outcomeBinds = [{
                'jobid': x['id'],
                'outcome': x['outcome']
            } for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed",
                                        "complete")

        # Arrange WMBS parentage
        if len(self.parentageBinds) > 0:
            self.setParentageByJob.execute(
                binds=self.parentageBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())
        if len(self.parentageBindsForMerge) > 0:
            self.setParentageByMergeJob.execute(
                binds=self.parentageBindsForMerge,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        # Straighten out DBS Parentage
        if len(self.mergedOutputFiles) > 0:
            self.handleDBSBufferParentage()

        if len(self.jobsWithSkippedFiles) > 0:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction=False)

        return returnList

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note that
        this will not return the JobGroup output fileset as all jobs will have
        their output placed there.
        """
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if not merged and outputFileset["output_fileset"] is not None:
                outputFilesets.append(outputFileset["output_fileset"])
            elif outputFileset["merged_output_fileset"] is not None:
                outputFilesets.append(outputFileset["merged_output_fileset"])

        return outputFilesets
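
The routing rule of outputFilesetsForJob in isolation (the output map contents here are hypothetical): unmerged files go to the per-module output fileset, merged files to the merged one:

outputMap = {'RECOoutput': [{'output_fileset': 11, 'merged_output_fileset': 12}]}

def filesetsFor(moduleLabel, merged):
    """Pick the destination filesets for a file, mirroring the method above."""
    filesets = []
    for entry in outputMap.get(moduleLabel, []):
        if not merged and entry['output_fileset'] is not None:
            filesets.append(entry['output_fileset'])
        elif entry['merged_output_fileset'] is not None:
            filesets.append(entry['merged_output_fileset'])
    return filesets

print(filesetsFor('RECOoutput', merged=False))  # [11]
print(filesetsFor('RECOoutput', merged=True))   # [12]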

    def addFileToDBS(self, jobReportFile, task, errorDataset=False):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"],
                                size=jobReportFile["size"],
                                events=jobReportFile["events"],
                                checksums=jobReportFile["checksums"],
                                status="NOTUPLOADED")
        dbsFile.setAlgorithm(appName=datasetInfo["applicationName"],
                             appVer=datasetInfo["applicationVersion"],
                             appFam=jobReportFile["module_label"],
                             psetHash="GIBBERISH",
                             configContent=jobReportFile.get('configURL'))

        if errorDataset:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"] + "-Error",
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))
        else:
            dbsFile.setDatasetPath(
                "/%s/%s/%s" %
                (datasetInfo["primaryDataset"],
                 datasetInfo["processedDataset"], datasetInfo["dataTier"]))

        dbsFile.setValidStatus(
            validStatus=jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(
            era=jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None))
        #TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None))
        dbsFile['task'] = task

        for run in jobReportFile["runs"]:
            newRun = Run(runNumber=run.run)
            newRun.extend(run.lumis)
            dbsFile.addRun(newRun)

        dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0],
                            immediateSave=False)
        self.dbsFilesToCreate.append(dbsFile)
        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parent of the file in DBS
        This is meant to be called recursively
        """
        parentsInfo = self.getParentInfoAction.execute(
            [lfn],
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] is None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether or
            # not this file has any redneck children and update their parentage
            # information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # And we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn=parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents
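
A toy rendition of the recursive parent walk above, run against an in-memory parentage table instead of the DAO (all LFNs and table entries are hypothetical):

PARENTAGE = {
    'child.root': [{'lfn': 'unmerged.root', 'merged': 0,
                    'gplfn': 'merged-gp.root', 'gpmerged': 1}],
}

def findMergedParents(lfn):
    """Walk up the parentage table until a merged ancestor is found."""
    parents = set()
    for info in PARENTAGE.get(lfn, []):
        if int(info['merged']) == 1:
            parents.add(info['lfn'])
        elif info['gpmerged'] is None:
            continue
        elif int(info['gpmerged']) == 1:
            parents.add(info['gplfn'])
        else:
            parents |= findMergedParents(info['gplfn'])  # recurse further up
    return parents

print(findMergedParents('child.root'))  # {'merged-gp.root'}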

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] == None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile,
                                                      jobID=jobID)

        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)

        if fwjrFile["merged"]:
            self.addFileToDBS(
                fwjrFile, task, jobType == "Repack"
                and fwjrFile["size"] > self.maxAllowedRepackOutputSize)

        return wmbsFile

    def _mapLocation(self, fwkJobReport):
        for file in fwkJobReport.getAllFileRefs():
            if file and hasattr(file, 'location'):
                file.location = self.phedex.getBestNodeName(
                    file.location, self.locLists)

    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean

        """
        jobSuccess = fwkJobReport.taskSuccessful()

        outputMap = self.getOutputMapAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        jobType = self.getJobTypeAction.execute(
            jobID=jobID,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            outputModules = set([])
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'])
            if set(outputMap.keys()) == outputModules:
                pass
            elif jobType == "LogCollect" and len(
                    outputMap.keys()) == 0 and outputModules == set(
                        ['LogCollect']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set([
                    'Merged', 'MergedError', 'logArchive'
            ]) and outputModules == set(['Merged', 'logArchive']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set([
                    'Merged', 'MergedError', 'logArchive'
            ]) and outputModules == set(['MergedError', 'logArchive']):
                pass
            elif jobType == "Express" and set(
                    outputMap.keys()).difference(outputModules) == set(
                        ['write_RAW']):
                pass
            else:
                failJob = True
                if jobType in ["Processing", "Production"]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error(
                        "Job %d , list of expected outputModules does not match job report, failing job",
                        jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID,
                                  sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID,
                                  sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(
                        step='logArch1')
                else:
                    logging.debug(
                        "Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job",
                        jobID)
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step='logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.warning("Job %d , bad jobReport, failing job", jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if len(fileList) > 0:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs, not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(
                        fwkJobReport, fwjrFile["lfn"],
                        fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error(
                        "Error occurred: associating log collect location, will try again\n %s"
                        % str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:

            wmbsJob = Job(id=jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:

                logging.debug("Job %d , register output %s", jobID,
                              fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType,
                                              fwjrFile,
                                              wmbsJob["mask"],
                                              jobID=jobID,
                                              task=fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({
                    "lfn": wmbsFile["lfn"],
                    "fileset": outputID
                })

                # LogCollect jobs have no output fileset
                if jobType == "LogCollect":
                    pass
                # Repack jobs that wrote too large merged output skip output filesets
                elif jobType == "Repack" and merged and wmbsFile[
                        "size"] > self.maxAllowedRepackOutputSize:
                    pass
                else:
                    outputFilesets = self.outputFilesetsForJob(
                        outputMap, merged, moduleLabel)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({
                            "lfn": wmbsFile["lfn"],
                            "fileset": outputFileset
                        })

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles and jobType not in ['LogCollect', 'Cleanup']:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            # Only save once job is done, and we're sure we made it through okay
            self._mapLocation(wmbsJob['fwjr'])
            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess

    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport,
                                                 logArchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        inputFileList = fwkJobReport.getAllInputFiles()
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView(
            "FWJRDump",
            'jobsByOutputLFN',
            options={"stale": "update_after"},
            keys=keys)['rows']
        if len(resultRows) > 0:
            #get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            #update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []

            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])

            self.localWMStats.updateLogArchiveLFN(parentJobNames, logArchiveLFN)
        else:
            # TODO: if the couch db is consistent with the DB, this check (resultRows > 0) should be removed.
            # The job needs to be failed and retried.
            logging.error(
                "Job report is missing for updating the log archive mapping.\nInput file list:\n%s"
                % inputFileList)

        return

    def createMissingFWKJR(self,
                           parameters,
                           errorCode=999,
                           errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_
        It does the actual job of creating things in DBSBuffer
        WARNING: This assumes all files in a job have the same final location
        """
        if len(self.dbsFilesToCreate) == 0:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc = []
        dbsCksumBinds = []
        runLumiBinds = []
        selfChecksums = None
        jobLocations = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID = None
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (
                dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"],
                dbsFile["appFam"], dbsFile["psetHash"],
                dbsFile['processingVer'], dbsFile['acquisitionEra'],
                dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({
                        'datasetAlgoPath': datasetAlgoPath,
                        'assocID': assocID
                    })
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(
                    workflowName,
                    taskPath,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
                workflowID = result['id']

            self.workflowPaths.append(workflowPath)
            self.workflowIDs.append({
                'workflowPath': workflowPath,
                'workflowID': workflowID
            })

            lfn = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'],
                                  assocID, dbsFile['status'], workflowID))

            dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

        try:

            diffLocation = jobLocations.difference(self.dbsLocations)

            for jobLocation in diffLocation:
                self.dbsInsertLocation.execute(
                    siteName=jobLocation,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
                self.dbsLocations.add(jobLocation)

            self.dbsCreateFiles.execute(files=dbsFileTuples,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetLocation.execute(binds=dbsFileLoc,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList=dbsCksumBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            if len(runLumiBinds) > 0:
                self.dbsSetRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s\n" % jobLocation)
            logging.debug("dbsFiles: %s\n" % dbsFileTuples)
            logging.debug("dbsFileLoc: %s\n" % dbsFileLoc)
            logging.debug("Checksum binds: %s\n" % dbsCksumBinds)
            logging.debug("RunLumi binds: %s\n" % runLumiBinds)
            raise AccountantWorkerException(msg)

        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return
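
The dataset/algo association above is memoised with a simple bounded cache: two parallel deque(maxlen=N) objects that evict their oldest entries together. A sketch of the pattern, with the database call (insertDatasetAlgo in the original) replaced by a stub:

import collections

datasetAlgoPaths = collections.deque(maxlen=1000)
datasetAlgoID = collections.deque(maxlen=1000)

def lookupAssoc(path, insert):
    """Return the cached assocID for path, calling insert() only on a miss."""
    if path in datasetAlgoPaths:
        for entry in datasetAlgoID:
            if entry['datasetAlgoPath'] == path:
                return entry['assocID']
    assocID = insert(path)  # stands in for dbsFile.insertDatasetAlgo()
    datasetAlgoPaths.append(path)
    datasetAlgoID.append({'datasetAlgoPath': path, 'assocID': assocID})
    return assocID

print(lookupAssoc('/Prim/Proc/TIER:algo', lambda p: 42))  # miss -> inserts, returns 42
print(lookupAssoc('/Prim/Proc/TIER:algo', lambda p: 99))  # hit  -> cached 42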

    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do everything that can be done in bulk
        """
        if len(wmbsFilesToBuild) == 0:
            # Nothing to do
            return

        runLumiBinds = []
        fileCksumBinds = []
        fileLocations = []
        fileCreate = []

        for wmbsFile in wmbsFilesToBuild:
            lfn = wmbsFile['lfn']
            if lfn is None:
                continue

            selfChecksums = wmbsFile['checksums']
            # by jobType add to different parentage relation
            # if it is the merge job, don't include the parentage on failed input files.
            # otherwise parentage is set for all input files.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})

            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if len(wmbsFile.getLocations()) > 0:
                outpnn = wmbsFile.getLocations()[0]
                if self.pnn_to_psn.get(outpnn, None):
                    fileLocations.append({'lfn': lfn, 'location': outpnn})
                else:
                    msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({
                        'lfn': lfn,
                        'cksum': selfChecksums[entry],
                        'cktype': entry
                    })

            fileCreate.append([
                lfn, wmbsFile['size'], wmbsFile['events'], None,
                wmbsFile["first_event"], wmbsFile['merged']
            ])

        if len(fileCreate) == 0:
            return

        try:

            self.addFileAction.execute(files=fileCreate,
                                       conn=self.getDBConn(),
                                       transaction=self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(
                    file=runLumiBinds,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())

            self.setFileAddChecksum.execute(
                bulkList=fileCksumBinds,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

            self.setFileLocation.execute(
                lfn=fileLocations,
                location=self.fileLocation,
                conn=self.getDBConn(),
                transaction=self.existingTransaction())

        except WMException:
            raise
        except Exception as ex:
            msg = "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds: \n")
            logging.debug("FileCreate binds: %s\n" % fileCreate)
            logging.debug("Runlumi binds: %s\n" % runLumiBinds)
            logging.debug("Checksum binds: %s\n" % fileCksumBinds)
            logging.debug("FileLocation binds: %s\n" % fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files
        wmbsFilesToBuild = []
        return

    def createFileFromDataStructsFile(self, file, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(file)

        if isinstance(file["locations"], set):
            pnn = list(file["locations"])[0]
        elif isinstance(file["locations"], list):
            if len(file['locations']) > 1:
                logging.error(
                    "Have more then one location for a file in job %i" %
                    (jobID))
                logging.error("Choosing location %s" % (file['locations'][0]))
            pnn = file["locations"][0]
        else:
            pnn = file["locations"]

        wmbsFile["locations"] = set()

        if pnn is not None:
            wmbsFile.setLocation(pnn=pnn, immediateSave=False)
        wmbsFile['jid'] = jobID

        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn=lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info(len(bindList))

        if len(bindList) > 0:
            try:
                self.dbsLFNHeritage.execute(
                    binds=bindList,
                    conn=self.getDBConn(),
                    transaction=self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg = "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.
        The way skipped files are handled imposes an important restriction:
        skipped files must have been processed by a single job in the task,
        with no job mask applied. This is suitable for jobs using the
        ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms.
        Here ACDC records are created and the files are moved
        from completed to wmbs_sub_files_failed.
        """
        jobList = self.getFullJobInfo.execute(
            [{
                'jobid': x
            } for x in self.jobsWithSkippedFiles.keys()],
            fileSelection=self.jobsWithSkippedFiles,
            conn=self.getDBConn(),
            transaction=self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask=False)
        return
Example No. 39
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes      = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime    = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR       = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes      = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs    = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad     = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will be then be available
        self.initAlerts(compName = "ErrorHandler")

        return

    def setup(self, parameters = None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [ job for job in jobList if job['type'] not in ['LogCollect','Cleanup'] ]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs    = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg = msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary = True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary = True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return
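
The retry decision in processRetries, in isolation (the job dicts here are hypothetical): jobs below their type's limit cool off, the rest stop retrying:

maxRetries = {'default': 3, 'Merge': 5}

def bucketJobs(jobList, state):
    """Split jobs into cooloff and retrydone, mirroring the loop above."""
    cooloffJobs, retrydoneJobs = [], []
    for job in jobList:
        allowed = maxRetries.get(job['type'], maxRetries['default'])
        if job['retry_count'] < allowed and state != 'create':
            cooloffJobs.append(job)
        else:
            retrydoneJobs.append(job)
    return cooloffJobs, retrydoneJobs

jobs = [{'type': 'Merge', 'retry_count': 4}, {'type': 'Processing', 'retry_count': 3}]
print(bucketJobs(jobs, 'job'))  # Merge job cools off, Processing job is done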

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []
        for job in jobList:
            report     = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" % job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodes]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    self.sendAlert(4, msg = msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs
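
Stripped of the report loading, readFWJRForErrors performs a three-way triage on wallclock time and exit codes. A standalone sketch (the exit code and thresholds below are illustrative, not taken from a real configuration):

def triageJob(exitCodes, runTime, failureCodes, passCodes, maxFailTime):
    """Classify a failed job as exhaust, pass (retry immediately) or cooloff."""
    if runTime is not None and runTime > maxFailTime:
        return 'exhaust'
    if any(code in failureCodes for code in exitCodes):
        return 'exhaust'
    if any(code in passCodes for code in exitCodes):
        return 'pass'
    return 'cooloff'

print(triageJob([8021], 600, failureCodes=[8021], passCodes=[],
                maxFailTime=32 * 3600))  # -> 'exhaust'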

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.debug("About to process %d retry done jobs" % len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()
        
        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.debug("About to process %d failures" % len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries the database for failed jobs and for jobs done with all
        retries, then processes them in chunks of maxProcessSize
        """

        # Run over created, submitted and executed job failures
        failure_states = [ 'create', 'submit', 'job' ]
        for state in failure_states:
            idList = self.getJobs.execute(state = "%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed" % (len(idList), state))
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state = 'retrydone')
        logging.info("Found %d jobs done with all retries" % len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return
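
The while loops above are plain list slicing in chunks of maxProcessSize; the same idiom written as a generator:

def chunked(idList, size):
    """Yield successive slices of idList, at most size elements each."""
    while idList:
        yield idList[:size]
        idList = idList[size:]

print(list(chunked([1, 2, 3, 4, 5], 2)))  # [[1, 2], [3, 4], [5]]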

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.idLoad.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs


    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID = binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def algorithm(self, parameters = None):
        """
        Runs the handleErrors method, looking for each type of failure
        and dealing with it as desired.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except WMException:
            try:
                myThread.transaction.rollback()
            except Exception:
                pass
            raise
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in ErrorHandler\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            msg = "Caught exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            msg += "\n\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            if getattr(myThread, 'transaction', None) is not None \
               and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise ErrorHandlerException(msg)
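
A minimal sketch of driving this poller by hand, as the tests later in this
document do (it assumes the ChangeState/DAO plumbing and a populated config
are already in place):

    poller = ErrorHandlerPoller(config)
    poller.setup(None)
    poller.algorithm(None)   # one pass: failed jobs -> cooloff or exhausted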
Example no. 40
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into jobs targeting a fixed number of events per job.
        A flag determines whether jobs must stop at file boundaries.
        """

        avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
        jobLimit = int(kwargs.get('job_limit', 0))
        jobTimeLimit = int(
            kwargs.get('job_time_limit', self.defaultJobTimeLimit))
        totalEvents = int(kwargs.get('total_events', 0))
        splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
        self.collectionName = kwargs.get('collectionName', None)
        splitOnRun = kwargs.get('splitOnRun', True)
        getParents = kwargs.get('include_parents', False)
        runWhitelist = kwargs.get('runWhitelist', [])
        runs = kwargs.get('runs', None)
        lumis = kwargs.get('lumis', None)
        applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
        deterministicPileup = kwargs.get('deterministicPileup', False)

        timePerEvent, sizePerEvent, memoryRequirement = \
            self.getPerformanceParameters(kwargs.get('performance', {}))

        eventsPerLumiInDataset = 0

        if avgEventsPerJob <= 0:
            msg = "events_per_job parameter must be positive. Its value is: %d" % avgEventsPerJob
            raise RuntimeError(msg)

        if self.package == 'WMCore.WMBS':
            self.loadRunLumi = self.daoFactory(
                classname="Files.GetBulkRunLumi")
            if deterministicPileup:
                getJobNumber = self.daoFactory(
                    classname="Jobs.GetNumberOfJobsPerWorkflow")
                self.nJobs = getJobNumber.execute(
                    workflow=self.subscription.getWorkflow().id)
                logging.info(
                    'Creating jobs in DeterministicPileup mode for %s',
                    self.subscription.workflowName())

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if self.collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')

                logging.info('Creating jobs for ACDC fileset %s', filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(self.collectionName,
                                                   filesetName)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList. "
                msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
                logging.exception(msg)
                return

        lDict = self.getFilesSortedByLocation(avgEventsPerJob)
        if not lDict:
            logging.info(
                "There are not enough events/files to be splitted. Trying again next cycle"
            )
            return

        locationDict = {}
        for key in lDict.keys():
            newlist = []
            # First we need to load the data
            if self.loadRunLumi:
                fileLumis = self.loadRunLumi.execute(files=lDict[key])
                if not fileLumis:
                    logging.warning(
                        "Empty fileLumis dict for workflow %s, subs %s.",
                        self.subscription.workflowName(),
                        self.subscription['id'])
                for f in lDict[key]:
                    lumiDict = fileLumis.get(f['id'], {})
                    for run in lumiDict.keys():
                        f.addRun(run=Run(run, *lumiDict[run]))

            for f in lDict[key]:
                if len(f['runs']) == 0:
                    continue
                f['runs'] = sorted(f['runs'])
                f['lumiCount'] = 0
                for run in f['runs']:
                    run.lumis.sort()
                    f['lumiCount'] += len(run.lumis)
                f['lowestRun'] = f['runs'][0]

                # Do average event per lumi calculation
                if f['lumiCount']:
                    f['avgEvtsPerLumi'] = round(
                        float(f['events']) / f['lumiCount'])
                    if deterministicPileup:
                        # We assume that all lumis are equal in the dataset
                        eventsPerLumiInDataset = f['avgEvtsPerLumi']
                else:
                    # No lumis in the file, ignore it
                    continue
                newlist.append(f)

            locationDict[key] = sorted(newlist,
                                       key=operator.itemgetter('lowestRun'))

        totalJobs = 0
        lastLumi = None
        firstLumi = None
        lastRun = None
        lumisInJob = 0
        totalAvgEventCount = 0
        currentJobAvgEventCount = 0
        stopTask = False
        self.lumiChecker = LumiChecker(applyLumiCorrection)
        for location in locationDict:

            # For each location, we need a new jobGroup
            self.newGroup()
            stopJob = True
            for f in locationDict[location]:

                if getParents:
                    parentLFNs = self.findParent(lfn=f['lfn'])
                    for lfn in parentLFNs:
                        parent = File(lfn=lfn)
                        f['parents'].add(parent)

                lumisInJobInFile = 0
                updateSplitOnJobStop = False
                failNextJob = False
                # If the estimated job time is higher than the job time limit
                # (condor limit) and it's only one lumi, then ditch that lumi
                timePerLumi = f['avgEvtsPerLumi'] * timePerEvent
                if timePerLumi > jobTimeLimit and f['lumiCount'] == 1:
                    failNextJob = True
                    stopJob = True
                    lumisPerJob = 1
                elif splitOnFile:
                    # Then we have to split on every boundary
                    stopJob = True
                    # Check the average number of events per lumi in this file
                    # Adapt the lumis per job to match the target conditions
                    if f['avgEvtsPerLumi']:
                        # If there are events in the file
                        ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                        lumisPerJob = max(int(math.floor(ratio)), 1)
                    else:
                        # Zero event file, then the ratio goes to infinity. Computers don't like that
                        lumisPerJob = f['lumiCount']
                else:
                    # Check how many events this job already has against the
                    # target, and include as many lumi sections as possible
                    updateSplitOnJobStop = True
                    eventsRemaining = max(
                        avgEventsPerJob - currentJobAvgEventCount, 0)
                    if f['avgEvtsPerLumi']:
                        lumisAllowed = int(
                            math.floor(
                                float(eventsRemaining) / f['avgEvtsPerLumi']))
                    else:
                        lumisAllowed = f['lumiCount']
                    lumisPerJob = max(lumisInJob + lumisAllowed, 1)

                for run in f['runs']:
                    if not isGoodRun(goodRunList=goodRunList, run=run.run):
                        # Then skip this one
                        continue
                    if len(runWhitelist) > 0 and run.run not in runWhitelist:
                        # Skip due to run whitelist
                        continue
                    firstLumi = None

                    if splitOnRun and run.run != lastRun:
                        # Then we need to kill this job and get a new one
                        stopJob = True

                    # Now loop over the lumis
                    for lumi in run:
                        if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi)
                                or self.lumiChecker.isSplitLumi(
                                    run.run, lumi, f)):
                            # Kill the chain of good lumis
                            # Skip this lumi
                            if firstLumi is not None and firstLumi != lumi:
                                self.currentJob['mask'].addRunAndLumis(
                                    run=run.run, lumis=[firstLumi, lastLumi])
                                eventsAdded = ((lastLumi - firstLumi + 1) *
                                               f['avgEvtsPerLumi'])
                                runAddedTime = eventsAdded * timePerEvent
                                runAddedSize = eventsAdded * sizePerEvent
                                self.currentJob.addResourceEstimates(
                                    jobTime=runAddedTime, disk=runAddedSize)
                                firstLumi = None
                                lastLumi = None
                            continue

                        # You have to kill the lumi chain if they're not continuous
                        if lastLumi and lumi != lastLumi + 1:
                            self.currentJob['mask'].addRunAndLumis(
                                run=run.run, lumis=[firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None

                        if firstLumi is None:
                            # Set the first lumi in the run
                            firstLumi = lumi

                        # If we're full, end the job
                        if lumisInJob == lumisPerJob:
                            stopJob = True
                        # Actually do the new job creation
                        if stopJob:
                            if firstLumi is not None and lastLumi is not None and lastRun is not None:
                                self.currentJob['mask'].addRunAndLumis(
                                    run=lastRun, lumis=[firstLumi, lastLumi])
                                eventsAdded = ((lastLumi - firstLumi + 1) *
                                               f['avgEvtsPerLumi'])
                                runAddedTime = eventsAdded * timePerEvent
                                runAddedSize = eventsAdded * sizePerEvent
                                self.currentJob.addResourceEstimates(
                                    jobTime=runAddedTime, disk=runAddedSize)
                            msg = None
                            if failNextJob:
                                msg = "File %s has a single lumi %s, in run %s " % (
                                    f['lfn'], lumi, run.run)
                                msg += "with too many events %d and it woud take %d sec to run" \
                                       % (f['events'], timePerLumi)
                            self.lumiChecker.closeJob(self.currentJob)
                            self.newJob(name=self.getJobName(),
                                        failedJob=failNextJob,
                                        failedReason=msg)
                            if deterministicPileup:
                                skipEvents = (
                                    self.nJobs -
                                    1) * lumisPerJob * eventsPerLumiInDataset
                                self.currentJob.addBaggageParameter(
                                    "skipPileupEvents", skipEvents)
                            self.currentJob.addResourceEstimates(
                                memory=memoryRequirement)
                            failNextJob = False
                            firstLumi = lumi
                            lumisInJob = 0
                            lumisInJobInFile = 0
                            currentJobAvgEventCount = 0
                            totalJobs += 1
                            if jobLimit and totalJobs > jobLimit:
                                msg = "Job limit of {0} jobs exceeded.".format(
                                    jobLimit)
                                raise RuntimeError(msg)

                            # Add the file to new jobs
                            self.currentJob.addFile(f)

                            if updateSplitOnJobStop:
                                # Then we were carrying from a previous file
                                # Reset calculations for this file
                                updateSplitOnJobStop = False
                                if f['avgEvtsPerLumi']:
                                    ratio = float(
                                        avgEventsPerJob) / f['avgEvtsPerLumi']
                                    lumisPerJob = max(int(math.floor(ratio)),
                                                      1)
                                else:
                                    lumisPerJob = f['lumiCount']

                        lumisInJob += 1
                        lumisInJobInFile += 1
                        lastLumi = lumi
                        stopJob = False
                        lastRun = run.run
                        totalAvgEventCount += f['avgEvtsPerLumi']

                        if self.currentJob and f not in self.currentJob[
                                'input_files']:
                            self.currentJob.addFile(f)

                        # We stop here if there are more total events than requested.
                        if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                            stopTask = True
                            break

                    if firstLumi is not None and lastLumi is not None:
                        # Add this run to the mask
                        self.currentJob['mask'].addRunAndLumis(
                            run=run.run, lumis=[firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) *
                                       f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(
                            jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if stopTask:
                        break

                if not splitOnFile:
                    currentJobAvgEventCount += f[
                        'avgEvtsPerLumi'] * lumisInJobInFile

                if stopTask:
                    break

            if stopTask:
                break

        self.lumiChecker.closeJob(self.currentJob)
        self.lumiChecker.fixInputFiles()
        return
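
For context, a splitter algorithm like the one above is normally driven
through WMCore's SplitterFactory rather than called directly. A hedged
sketch (parameter names are the kwargs parsed above; the factory usage
follows the pattern of typical WMCore job-splitting tests, and
`subscription` is assumed to be an existing, populated subscription):

    from WMCore.JobSplitting.SplitterFactory import SplitterFactory

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.DataStructs",
                          subscription=subscription)
    jobGroups = jobFactory(events_per_job=5000,
                           halt_job_on_file_boundaries=True,
                           splitOnRun=True,
                           performance={'timePerEvent': 12,
                                        'sizePerEvent': 512,
                                        'memoryRequirement': 2300})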
Example no. 41
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into jobs with a fixed number of lumis per job.
        A flag determines whether jobs must stop at file boundaries.
        """

        lumisPerJob = int(kwargs.get('lumis_per_job', 1))
        totalLumis = int(kwargs.get('total_lumis', 0))
        splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', False))
        self.collectionName = kwargs.get('collectionName', None)
        splitOnRun = kwargs.get('splitOnRun', True)
        getParents = kwargs.get('include_parents', False)
        runWhitelist = kwargs.get('runWhitelist', [])
        runs = kwargs.get('runs', None)
        lumis = kwargs.get('lumis', None)
        deterministicPileup = kwargs.get('deterministicPileup', False)
        applyLumiCorrection = bool(kwargs.get('applyLumiCorrection', False))
        eventsPerLumiInDataset = 0

        if self.package == 'WMCore.WMBS':
            self.loadRunLumi = self.daoFactory(
                classname="Files.GetBulkRunLumi")
            if deterministicPileup:
                getJobNumber = self.daoFactory(
                    classname="Jobs.GetNumberOfJobsPerWorkflow")
                self.nJobs = getJobNumber.execute(
                    workflow=self.subscription.getWorkflow().id)

        timePerEvent, sizePerEvent, memoryRequirement = \
            self.getPerformanceParameters(kwargs.get('performance', {}))

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if self.collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')

                logging.info('Creating jobs for ACDC fileset %s', filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(self.collectionName,
                                                   filesetName)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList. "
                msg += "Refusing to create any jobs.\nDetails: %s" % str(ex)
                logging.exception(msg)
                return

        lDict = self.getFilesSortedByLocation(lumisPerJob)
        if not lDict:
            logging.info(
                "There are not enough lumis/files to be splitted. Trying again next cycle"
            )
            return
        locationDict = {}
        for key in lDict.keys():
            newlist = []
            for f in lDict[key]:
                # if hasattr(f, 'loadData'):
                #    f.loadData()
                if len(f['runs']) == 0:
                    continue
                f['lumiCount'] = 0
                f['runs'] = sorted(f['runs'])
                for run in f['runs']:
                    run.lumis.sort()
                    f['lumiCount'] += len(run.lumis)
                f['lowestRun'] = f['runs'][0]
                # Do average event per lumi calculation
                if f['lumiCount']:
                    f['avgEvtsPerLumi'] = round(
                        float(f['events']) / f['lumiCount'])
                    if deterministicPileup:
                        # We assume that all lumis are equal in the dataset
                        eventsPerLumiInDataset = f['avgEvtsPerLumi']
                else:
                    # No lumis in the file, ignore it
                    continue
                newlist.append(f)
            locationDict[key] = sorted(newlist,
                                       key=operator.itemgetter('lowestRun'))

        # Split files into jobs with each job containing
        # EXACTLY lumisPerJob number of lumis (except for maybe the last one)

        totalJobs = 0
        lastLumi = None
        firstLumi = None
        stopJob = True
        stopTask = False
        lastRun = None
        lumisInJob = 0
        lumisInTask = 0
        self.lumiChecker = LumiChecker(applyLumiCorrection)
        for location in locationDict.keys():

            # For each location, we need a new jobGroup
            self.newGroup()
            stopJob = True
            for f in locationDict[location]:
                if getParents:
                    parentLFNs = self.findParent(lfn=f['lfn'])
                    for lfn in parentLFNs:
                        parent = File(lfn=lfn)
                        f['parents'].add(parent)

                if splitOnFile:
                    # Then we have to split on every boundary
                    stopJob = True

                for run in f['runs']:
                    if not isGoodRun(goodRunList=goodRunList, run=run.run):
                        # Then skip this one
                        continue
                    if len(runWhitelist) > 0 and run.run not in runWhitelist:
                        # Skip due to run whitelist
                        continue
                    firstLumi = None

                    if splitOnRun and run.run != lastRun:
                        # Then we need to kill this job and get a new one
                        stopJob = True

                    # Now loop over the lumis
                    for lumi in run:
                        # splitLumi checks if the lumi is split across jobs
                        if (not isGoodLumi(goodRunList, run=run.run, lumi=lumi)
                                or self.lumiChecker.isSplitLumi(
                                    run.run, lumi, f)):
                            # Kill the chain of good lumis
                            # Skip this lumi
                            if firstLumi is not None and firstLumi != lumi:
                                self.currentJob['mask'].addRunAndLumis(
                                    run=run.run, lumis=[firstLumi, lastLumi])
                                addedEvents = ((lastLumi - firstLumi + 1) *
                                               f['avgEvtsPerLumi'])
                                runAddedTime = addedEvents * timePerEvent
                                runAddedSize = addedEvents * sizePerEvent
                                self.currentJob.addResourceEstimates(
                                    jobTime=runAddedTime, disk=runAddedSize)
                                firstLumi = None
                                lastLumi = None
                            continue

                        # You have to kill the lumi chain if they're not continuous
                        if lastLumi and lumi != lastLumi + 1:
                            self.currentJob['mask'].addRunAndLumis(
                                run=run.run, lumis=[firstLumi, lastLumi])
                            addedEvents = ((lastLumi - firstLumi + 1) *
                                           f['avgEvtsPerLumi'])
                            runAddedTime = addedEvents * timePerEvent
                            runAddedSize = addedEvents * sizePerEvent
                            self.currentJob.addResourceEstimates(
                                jobTime=runAddedTime, disk=runAddedSize)
                            firstLumi = None
                            lastLumi = None

                        if firstLumi is None:
                            # Set the first lumi in the run
                            firstLumi = lumi

                        # If we're full, end the job
                        if lumisInJob == lumisPerJob:
                            stopJob = True
                        # Actually do the new job creation
                        if stopJob:
                            if firstLumi is not None and lastLumi is not None and lastRun is not None:
                                self.currentJob['mask'].addRunAndLumis(
                                    run=lastRun, lumis=[firstLumi, lastLumi])
                                addedEvents = ((lastLumi - firstLumi + 1) *
                                               f['avgEvtsPerLumi'])
                                runAddedTime = addedEvents * timePerEvent
                                runAddedSize = addedEvents * sizePerEvent
                                self.currentJob.addResourceEstimates(
                                    jobTime=runAddedTime, disk=runAddedSize)
                            # before creating a new job add the lumis of the current one to the checker
                            self.lumiChecker.closeJob(self.currentJob)
                            self.newJob(name=self.getJobName())
                            self.currentJob.addResourceEstimates(
                                memory=memoryRequirement)
                            if deterministicPileup:
                                skipEvents = (
                                    self.nJobs -
                                    1) * lumisPerJob * eventsPerLumiInDataset
                                self.currentJob.addBaggageParameter(
                                    "skipPileupEvents", skipEvents)
                            firstLumi = lumi
                            lumisInJob = 0
                            totalJobs += 1

                            # Add the file to new jobs
                            self.currentJob.addFile(f)

                        lumisInJob += 1
                        lumisInTask += 1
                        lastLumi = lumi
                        stopJob = False
                        lastRun = run.run

                        if self.currentJob and f not in self.currentJob[
                                'input_files']:
                            self.currentJob.addFile(f)

                        if totalLumis > 0 and lumisInTask >= totalLumis:
                            stopTask = True
                            break

                    if firstLumi is not None and lastLumi is not None:
                        # Add this run to the mask
                        self.currentJob['mask'].addRunAndLumis(
                            run=run.run, lumis=[firstLumi, lastLumi])
                        addedEvents = ((lastLumi - firstLumi + 1) *
                                       f['avgEvtsPerLumi'])
                        runAddedTime = addedEvents * timePerEvent
                        runAddedSize = addedEvents * sizePerEvent
                        self.currentJob.addResourceEstimates(
                            jobTime=runAddedTime, disk=runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if stopTask:
                        break

                if stopTask:
                    break

            if stopTask:
                break

        self.lumiChecker.closeJob(self.currentJob)
        self.lumiChecker.fixInputFiles()
        return
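
The goodRunList consumed by isGoodRun/isGoodLumi above is a plain dict
mapping run numbers to lists of inclusive [first, last] lumi ranges. A
hedged sketch of what buildLumiMask produces from the runs/lumis kwargs
(run numbers and lumi ranges here are illustrative):

    runs = ['206175', '206178']
    lumis = ['1,50,60,70', '1,10']
    # buildLumiMask(runs, lumis) yields a mask shaped like:
    goodRunList = {'206175': [[1, 50], [60, 70]],
                   '206178': [[1, 10]]}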
Example no. 42
class ErrorHandlerTest(unittest.TestCase):
    """
    TestCase for the ErrorHandler module
    """
    def setUp(self):
        """
        setup for test.
        """
        myThread = threading.currentThread()
        
        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase = True)
        self.testInit.setSchema(customModules = ["WMCore.WMBS"],
                                useDefault = False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.getJobs = self.daofactory(classname = "Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname = "Jobs.SetStateTime")
        locationAction = self.daofactory(classname = "Locations.New")
        locationAction.execute(siteName = "malpaquet", seName = "malpaquet")
        self.testDir = self.testInit.generateWorkDir()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url = self.testInit.couchUrl,
                                            database = "errorhandler_t")
        
        return

    def tearDown(self):
        """
        Database deletion
        """
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        return

    def getConfig(self):
        """
        _getConfig_

        """
        config = Configuration()

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket     = os.getenv("DBSOCK")

        config.component_("ErrorHandler")
        # The log level of the component. 
        config.ErrorHandler.logLevel = 'DEBUG'
        # The namespace of the component
        config.ErrorHandler.namespace = 'WMComponent.ErrorHandler.ErrorHandler'
        # maximum number of threads we want dealing
        # with messages per pool
        config.ErrorHandler.maxThreads = 30
        # maximum number of retries we want for a job
        config.ErrorHandler.maxRetries = 5
        # The poll interval at which to look for failed jobs
        config.ErrorHandler.pollInterval = 60

        # JobStateMachine
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl        = os.getenv('COUCHURL', None)
        config.JobStateMachine.couchDBName     = "errorhandler_t_jd"


        config.section_('ACDC')
        config.ACDC.couchurl = self.testInit.couchUrl
        config.ACDC.database = "errorhandler_t"

        return config


    def createWorkload(self, workloadName = 'Test', emulator = True):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, holding the basic necessities.
        """

        workload = testWorkload("Tier1ReReco")
        rereco = workload.getTask("ReReco")

        # Add RequestManager stuff
        workload.data.request.section_('schema')
        workload.data.request.schema.Requestor = 'nobody'
        workload.data.request.schema.Group     = 'testers'
        
        
        taskMaker = TaskMaker(workload, os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    

    def createTestJobGroup(self, nJobs = 10, retry_count = 1,
                           workloadPath = 'test', fwjrPath = None,
                           workloadName = makeUUID()):
        """
        Creates a group of several jobs
        """


        myThread = threading.currentThread()
        myThread.transaction.begin()
        testWorkflow = Workflow(spec = workloadPath, owner = "cmsdataops", group = "cmsdataops",
                                name = workloadName, task="/TestWorkload/ReReco")
        testWorkflow.create()
        
        testWMBSFileset = Fileset(name = "TestFileset")
        testWMBSFileset.create()
        
        testSubscription = Subscription(fileset = testWMBSFileset,
                                        workflow = testWorkflow)
        testSubscription.create()

        testJobGroup = JobGroup(subscription = testSubscription)
        testJobGroup.create()

        testFile0 = File(lfn = "/this/is/a/parent", size = 1024, events = 10)
        testFile0.addRun(Run(10, *[12312]))
        testFile0.setLocation('malpaquet')

        testFileA = File(lfn = "/this/is/a/lfnA", size = 1024, events = 10,
                         first_event = 88, last_event = 99)
        testFileA.addRun(Run(10, *[12312, 12313]))
        testFileA.setLocation('malpaquet')

        testFileB = File(lfn = "/this/is/a/lfnB", size = 1024, events = 10,
                         first_event = 88, last_event = 99)
        testFileB.addRun(Run(10, *[12314, 12315, 12316]))
        testFileB.setLocation('malpaquet')

        testFile0.create()
        testFileA.create()
        testFileB.create()

        testFileA.addParent(lfn = "/this/is/a/parent")
        testFileB.addParent(lfn = "/this/is/a/parent")

        for i in range(0, nJobs):
            testJob = Job(name = makeUUID())
            testJob['retry_count'] = retry_count
            testJob['retry_max'] = 10
            testJob['mask'].addRunAndLumis(run = 10, lumis = [12312])
            testJob['mask'].addRunAndLumis(run = 10, lumis = [12314, 12316])
            testJob['mask']['FirstEvent'] = 100
            testJob['cache_dir'] = os.path.join(self.testDir, testJob['name'])
            testJob['fwjr_path'] = fwjrPath
            os.mkdir(testJob['cache_dir'])
            testJobGroup.add(testJob)
            testJob.create(group = testJobGroup)
            testJob.addFile(testFileA)
            testJob.addFile(testFileB)
            testJob.save()

        
        testJobGroup.commit()


        testSubscription.acquireFiles(files = [testFileA, testFileB])
        testSubscription.save()
        myThread.transaction.commit()
        
        return testJobGroup


    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()
        
        Mimics creation of the component and tests jobs that failed in the create stage.
        """

        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')
        
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               workloadName = workloadName)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), self.nJobs)

        changer.propagate(testJobGroup.jobs, 'new', 'CreateCooloff')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        # Now exhaust them
        for job in testJobGroup.jobs:
            job['retry_count'] = 6
            job.save()
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['lumis'] in [[12312], [12314, 12315, 12316]],
                                "Unknown lumi %s" % f['runs'][0]['lumis'])
                self.assertEqual(f['merged'], 1)
                self.assertEqual(f['first_event'], 88)
                self.assertEqual(f['last_event'], 99)
            self.assertEqual(counter, 20)
        return
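
    # Hedged sketch (hypothetical helper, not part of ErrorHandler) of the
    # exhaustion rule exercised above: once a job's retry_count reaches the
    # configured maximum, it is exhausted instead of sent to cooloff again.
    def _exhaustionRuleSketch(self, job, maxRetries):
        limit = maxRetries['default'] if isinstance(maxRetries, dict) else maxRetries
        return job['retry_count'] >= limit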

    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()
        
        Mimics creation of the component and tests jobs that failed in the submit stage.
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')
        
        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of the component and tests jobs that failed in the execute stage.
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return


    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs, retry_count = 5,
                                               workloadPath = workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id = 1) # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)


        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)

    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'
        
        workload = self.createWorkload(workloadName = workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', 'TestWorkload',
                                    'WMSandbox', 'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs = self.nJobs,
                                               workloadPath = workloadPath,
                                               fwjrPath = fwjrPath)
        
        config = self.getConfig()
        config.ErrorHandler.readFWJR         = True
        config.ErrorHandler.failureExitCodes = [8020]
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.failureExitCodes = []
        config.ErrorHandler.maxFailTime      = -10
        testErrorHandler2 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state = 'JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'JobCooloff')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state = 'Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime      = 24 * 3600
        config.ErrorHandler.passExitCodes    = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        idList = self.getJobs.execute(state = 'Created')
        self.assertEqual(len(idList), self.nJobs)

        return

    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        # Disabled by default; remove this return to run the profiling code below
        return

        import cProfile, pstats

        nJobs = 1000

        testJobGroup = self.createTestJobGroup(nJobs = nJobs)
        
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'createfailed', 'new')

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        testErrorHandler.setup(None)
        startTime = time.time()
        #cProfile.runctx("testErrorHandler.algorithm()", globals(), locals(), filename = "profStats.stat")
        testErrorHandler.algorithm()
        stopTime = time.time()

        idList = self.getJobs.execute(state = 'CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state = 'CreateCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)
        
        return
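
The test class above can be run on its own with the standard unittest
entry point (a minimal sketch; WMCore's own test runner may differ):

    if __name__ == '__main__':
        import unittest
        unittest.main()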
Example no. 43
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into jobs targeting a fixed number of events per job.
        A flag determines whether jobs must stop at file boundaries.
        """

        avgEventsPerJob = int(kwargs.get('events_per_job', 5000))
        eventLimit      = int(kwargs.get('max_events_per_lumi', 20000))
        totalEvents     = int(kwargs.get('total_events', 0))
        splitOnFile     = bool(kwargs.get('halt_job_on_file_boundaries', True))
        ignoreACDC      = bool(kwargs.get('ignore_acdc_except', False))
        collectionName  = kwargs.get('collectionName', None)
        splitOnRun      = kwargs.get('splitOnRun', True)
        getParents      = kwargs.get('include_parents', False)
        runWhitelist    = kwargs.get('runWhitelist', [])
        runs            = kwargs.get('runs', None)
        lumis           = kwargs.get('lumis', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))
        deterministicPileup = kwargs.get('deterministicPileup', False)
        eventsPerLumiInDataset = 0

        if deterministicPileup and self.package == 'WMCore.WMBS':
            getJobNumber = self.daoFactory(classname = "Jobs.GetNumberOfJobsPerWorkflow")
            jobNumber = getJobNumber.execute(workflow = self.subscription.getWorkflow().id)
            self.nJobs = jobNumber

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL       = kwargs.get('couchURL')
                couchDB        = kwargs.get('couchDB')
                filesetName    = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner          = kwargs.get('owner')
                group          = kwargs.get('group')

                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(collectionName, filesetName, owner, group)
            except Exception as ex:
                msg =  "Exception while trying to load goodRunList\n"
                if ignoreACDC:
                    msg +=  "Ditching goodRunList\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    goodRunList = {}
                else:
                    msg +=  "Refusing to create any jobs.\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    return

        lDict = self.sortByLocation()
        locationDict = {}

        # First we need to load the data
        if self.package == 'WMCore.WMBS':
            loadRunLumi = self.daoFactory(classname = "Files.GetBulkRunLumi")

        for key in lDict.keys():
            newlist = []
            # First we need to load the data
            if self.package == 'WMCore.WMBS':
                fileLumis = loadRunLumi.execute(files = lDict[key])
                for f in lDict[key]:
                    lumiDict = fileLumis.get(f['id'], {})
                    for run in lumiDict.keys():
                        f.addRun(run = Run(run, *lumiDict[run]))

            for f in lDict[key]:
                if len(f['runs']) == 0:
                    continue
                f['runs'] = sorted(f['runs'])
                f['lumiCount'] = 0
                for run in f['runs']:
                    run.lumis.sort()
                    f['lumiCount'] += len(run.lumis)
                f['lowestRun'] = f['runs'][0]

                #Do average event per lumi calculation
                if f['lumiCount']:
                    f['avgEvtsPerLumi'] = round(float(f['events'])/f['lumiCount'])
                    if deterministicPileup:
                        # We assume that all lumis are equal in the dataset
                        eventsPerLumiInDataset = f['avgEvtsPerLumi']
                else:
                    #No lumis in the file, ignore it
                    continue
                newlist.append(f)


            locationDict[key] = sorted(newlist, key=operator.itemgetter('lowestRun'))

        totalJobs      = 0
        lastLumi       = None
        firstLumi      = None
        lastRun        = None
        lumisInJob     = 0
        totalAvgEventCount = 0
        currentJobAvgEventCount = 0
        stopTask = False
        for location in locationDict:

            # For each location, we need a new jobGroup
            self.newGroup()
            stopJob = True
            for f in locationDict[location]:

                if getParents:
                    parentLFNs = self.findParent(lfn = f['lfn'])
                    for lfn in parentLFNs:
                        parent = File(lfn = lfn)
                        f['parents'].add(parent)

                lumisInJobInFile = 0
                updateSplitOnJobStop = False
                failNextJob          = False
                #If the number of events per lumi is higher than the limit
                #and it's only one lumi then ditch that lumi
                if f['avgEvtsPerLumi'] > eventLimit and f['lumiCount'] == 1:
                    failNextJob = True
                    stopJob = True
                    lumisPerJob = 1
                elif splitOnFile:
                    # Then we have to split on every boundary
                    stopJob = True
                    #Check the average number of events per lumi in this file
                    #Adapt the lumis per job to match the target conditions
                    if f['avgEvtsPerLumi']:
                        #If there are events in the file
                        ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                        lumisPerJob = max(int(math.floor(ratio)), 1)
                    else:
                        #Zero event file, then the ratio goes to infinity. Computers don't like that
                        lumisPerJob = f['lumiCount']
                else:
                    #Check how many events this job already has against the
                    #target, and include as many lumi sections as possible
                    updateSplitOnJobStop = True
                    eventsRemaining = max(avgEventsPerJob - currentJobAvgEventCount, 0)
                    if f['avgEvtsPerLumi']:
                        lumisAllowed = int(math.floor(float(eventsRemaining) / f['avgEvtsPerLumi']))
                    else:
                        lumisAllowed = f['lumiCount']
                    lumisPerJob = max(lumisInJob + lumisAllowed, 1)

                for run in f['runs']:
                    if not isGoodRun(goodRunList = goodRunList, run = run.run):
                        # Then skip this one
                        continue
                    if len(runWhitelist) > 0 and run.run not in runWhitelist:
                        # Skip due to run whitelist
                        continue
                    firstLumi = None

                    if splitOnRun and run.run != lastRun:
                        # Then we need to kill this job and get a new one
                        stopJob = True

                    # Now loop over the lumis
                    for lumi in run:
                        if not isGoodLumi(goodRunList, run = run.run, lumi = lumi):
                            # Kill the chain of good lumis
                            # Skip this lumi
                            if firstLumi is not None and firstLumi != lumi:
                                self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                                       lumis = [firstLumi, lastLumi])
                                eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                                runAddedTime = eventsAdded * timePerEvent
                                runAddedSize = eventsAdded * sizePerEvent
                                self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                                firstLumi = None
                                lastLumi = None
                            continue

                        # You have to kill the lumi chain if they're not continuous
                        if lastLumi and lumi != lastLumi + 1:
                            self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                                   lumis = [firstLumi, lastLumi])
                            eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                            runAddedTime = eventsAdded * timePerEvent
                            runAddedSize = eventsAdded * sizePerEvent
                            self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                            firstLumi = None
                            lastLumi = None

                        if firstLumi is None:
                            # Set the first lumi in the run
                            firstLumi = lumi

                        # If we're full, end the job
                        if lumisInJob == lumisPerJob:
                            stopJob = True
                        # Actually do the new job creation
                        if stopJob:
                            if firstLumi is not None and lastLumi is not None and lastRun is not None:
                                self.currentJob['mask'].addRunAndLumis(run = lastRun,
                                                                       lumis = [firstLumi, lastLumi])
                                eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                                runAddedTime = eventsAdded * timePerEvent
                                runAddedSize = eventsAdded * sizePerEvent
                                self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                            msg = None
                            if failNextJob:
                                msg = "File %s has too many events (%d) in %d lumi(s)" % (f['lfn'],
                                                                                          f['events'],
                                                                                          f['lumiCount'])
                            self.newJob(name = self.getJobName(), failedJob = failNextJob,
                                        failedReason = msg)
                            if deterministicPileup:
                                self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * lumisPerJob * eventsPerLumiInDataset)
                            self.currentJob.addResourceEstimates(memory = memoryRequirement)
                            failNextJob = False
                            firstLumi = lumi
                            lumisInJob = 0
                            lumisInJobInFile = 0
                            currentJobAvgEventCount = 0
                            totalJobs += 1

                            # Add the file to new jobs
                            self.currentJob.addFile(f)

                            if updateSplitOnJobStop:
                                #Then we were carrying from a previous file
                                #Reset calculations for this file
                                updateSplitOnJobStop = False
                                if f['avgEvtsPerLumi']:
                                    ratio = float(avgEventsPerJob) / f['avgEvtsPerLumi']
                                    lumisPerJob = max(int(math.floor(ratio)), 1)
                                else:
                                    lumisPerJob = f['lumiCount']

                        lumisInJob += 1
                        lumisInJobInFile += 1
                        lastLumi = lumi
                        stopJob = False
                        lastRun = run.run
                        totalAvgEventCount += f['avgEvtsPerLumi']

                        if self.currentJob and f not in self.currentJob['input_files']:
                            self.currentJob.addFile(f)

                        # We stop here if there are more total events than requested.
                        if totalEvents > 0 and totalAvgEventCount >= totalEvents:
                            stopTask = True
                            break

                    if firstLumi is not None and lastLumi is not None:
                        # Add this run to the mask
                        self.currentJob['mask'].addRunAndLumis(run = run.run,
                                                               lumis = [firstLumi, lastLumi])
                        eventsAdded = ((lastLumi - firstLumi + 1) * f['avgEvtsPerLumi'])
                        runAddedTime = eventsAdded * timePerEvent
                        runAddedSize = eventsAdded * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime = runAddedTime, disk = runAddedSize)
                        firstLumi = None
                        lastLumi = None

                    if stopTask:
                        break

                if not splitOnFile:
                    currentJobAvgEventCount += f['avgEvtsPerLumi'] * lumisInJobInFile

                if stopTask:
                    break

            if stopTask:
                break

        return
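The range-closing arithmetic above is repeated in several places: whenever a contiguous lumi range [firstLumi, lastLumi] is closed, the job is charged (lastLumi - firstLumi + 1) * avgEvtsPerLumi events, scaled by the per-event time and size from the performance parameters. A minimal standalone sketch of that bookkeeping (the helper name and the constants are illustrative, not from WMCore):

def estimateClosedRange(firstLumi, lastLumi, avgEvtsPerLumi,
                        timePerEvent, sizePerEvent):
    """Resource estimate for one contiguous lumi range, per the logic above."""
    eventsAdded = (lastLumi - firstLumi + 1) * avgEvtsPerLumi
    return eventsAdded * timePerEvent, eventsAdded * sizePerEvent

# A range covering lumis 4-7 at 100 events/lumi, 3.0 s and 1.5 MB per event:
jobTime, disk = estimateClosedRange(4, 7, 100, timePerEvent=3.0, sizePerEvent=1.5)
assert (jobTime, disk) == (1200.0, 600.0)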
Example no. 44
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # Initialize the alert framework (if available, i.e. config.Alert present);
        # self.sendAlert will then be available
        self.initAlerts(compName = "ErrorHandler")

        return

    def setup(self, parameters):
        """
        Load DB objects required for queries
        """

        # For now, does nothing

        return

    def terminate(self, params):
        """
        _terminate_
        
        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)


    def processRetries(self, jobs, jobType):
        """
        Actually do the retries

        """
        logging.info("Processing retries for %i failed jobs of type %s." % (len(jobs), jobType))
        exhaustJobs = []
        cooloffJobs = []

        for ajob in jobs:
            # Retries < max retry count: the job gets another chance
            if ajob['retry_count'] < self.maxRetries:
                cooloffJobs.append(ajob)
            # Retries >= max retry count: the job is exhausted
            else:
                exhaustJobs.append(ajob)
                msg = "Exhausting job %i" % ajob['id']
                logging.error(msg)
                self.sendAlert(6, msg = msg)
                logging.debug("JobInfo: %s" % ajob)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        self.changeState.propagate(exhaustJobs, 'exhausted',
                                   '%sfailed' % jobType)
        self.changeState.propagate(cooloffJobs, '%scooloff' % jobType,
                                   '%sfailed' % jobType)

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in exhaustJobs:
            job.failInputFiles()

        return exhaustJobs


    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        logging.debug("Entering ACDC with %i jobs" % len(jobList))
        for job in jobList:
            job.getMask()

        self.dataCollection.failedJobs(jobList)
        return

    def splitJobList(self, jobList, jobType):
        """
        _splitJobList_

        Split up list of jobs into more manageable chunks if necessary
        """
        if len(jobList) < 1:
            # Nothing to do
            return

        myThread = threading.currentThread()

        while len(jobList) > 0:
            # Loop over the list and handle it one chunk at a time
            tmpList = jobList[:self.maxProcessSize]
            jobList = jobList[self.maxProcessSize:]
            logging.debug("About to process %i errors" % len(tmpList))
            myThread.transaction.begin()
            exhaustList = self.processRetries(tmpList, jobType)
            self.handleACDC(jobList = exhaustList)
            myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """

        createList = []
        submitList = []
        jobList    = []

        # Run over created jobs
        idList = self.getJobs.execute(state = 'CreateFailed')
        logging.info("Found %s failed jobs failed during creation" \
                     % len(idList))
        if len(idList) > 0:
            createList = self.loadJobsFromList(idList = idList)

        # Run over submitted jobs
        idList = self.getJobs.execute(state = 'SubmitFailed')
        logging.info("Found %s failed jobs failed during submit" \
                     % len(idList))
        if len(idList) > 0:
            submitList = self.loadJobsFromList(idList = idList)

        # Run over executed jobs
        idList = self.getJobs.execute(state = 'JobFailed')
        logging.info("Found %s failed jobs failed during execution" \
                     % len(idList))
        if len(idList) > 0:
            jobList = self.loadJobsFromList(idList = idList)

        self.splitJobList(jobList = createList, jobType = 'create')
        self.splitJobList(jobList = submitList, jobType = 'submit')
        self.splitJobList(jobList = jobList,    jobType = 'job')

        return


    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """

        loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = loadAction.execute(jobID = binds)

        # The DAO may return a single dict; normalize it to a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs


    def algorithm(self, parameters = None):
        """
	Performs the handleErrors method, looking for each type of failure
	And deal with it as desired.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except WMException:
            try:
                myThread.transaction.rollback()
            except Exception:
                pass
            raise
        except Exception as ex:
            msg = "Caught exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            msg += "\n\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            if getattr(myThread, 'transaction', None) is not None \
               and getattr(myThread.transaction, 'transaction', None) is not None:
                myThread.transaction.rollback()
            raise ErrorHandlerException(msg)
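Taken together, splitJobList and processRetries implement a simple pattern: walk the failed-job list in maxProcessSize chunks (one transaction each), and within a chunk route jobs whose retry_count is below maxRetries to cooloff and the rest to exhausted. A rough sketch of that control flow with plain dicts, leaving out WMBS, ChangeState and ACDC (names are illustrative):

def classifyChunks(jobs, maxRetries, maxProcessSize):
    """Yield (cooloffJobs, exhaustJobs) per chunk, mirroring the poller above."""
    for start in range(0, len(jobs), maxProcessSize):
        chunk = jobs[start:start + maxProcessSize]
        cooloff = [j for j in chunk if j['retry_count'] < maxRetries]
        exhaust = [j for j in chunk if j['retry_count'] >= maxRetries]
        yield cooloff, exhaust

jobs = [{'id': i, 'retry_count': i % 4} for i in range(6)]
for cooloff, exhaust in classifyChunks(jobs, maxRetries=3, maxProcessSize=4):
    print("%d cooloff, %d exhausted" % (len(cooloff), len(exhaust)))
    # prints "3 cooloff, 1 exhausted" then "2 cooloff, 0 exhausted"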
Example no. 45
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                        logger = myThread.logger,
                                        dbinterface = myThread.dbi)

        self.getOutputMapAction      = self.daofactory(classname = "Jobs.GetOutputMap")
        self.bulkAddToFilesetAction  = self.daofactory(classname = "Fileset.BulkAddByLFN")
        self.bulkParentageAction     = self.daofactory(classname = "Files.AddBulkParentage")
        self.getJobTypeAction        = self.daofactory(classname = "Jobs.GetType")
        self.getParentInfoAction     = self.daofactory(classname = "Files.GetParentInfo")
        self.setParentageByJob       = self.daofactory(classname = "Files.SetParentageByJob")
        self.setParentageByMergeJob  = self.daofactory(classname = "Files.SetParentageByMergeJob")
        self.setFileRunLumi          = self.daofactory(classname = "Files.AddRunLumi")
        self.setFileLocation         = self.daofactory(classname = "Files.SetLocationByLFN")
        self.setFileAddChecksum      = self.daofactory(classname = "Files.AddChecksumByLFN")
        self.addFileAction           = self.daofactory(classname = "Files.Add")
        self.jobCompleteInput        = self.daofactory(classname = "Jobs.CompleteInput")
        self.setBulkOutcome          = self.daofactory(classname = "Jobs.SetOutcomeBulk")
        self.getWorkflowSpec         = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID          = self.daofactory(classname = "Jobs.LoadFromID")
        self.getFullJobInfo          = self.daofactory(classname = "Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction    = self.daofactory(classname = "Jobs.GetFWJRTaskName")

        self.dbsStatusAction       = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction     = self.dbsDaoFactory(classname = "DBSBufferFiles.GetChildren")
        self.dbsCreateFiles        = self.dbsDaoFactory(classname = "DBSBufferFiles.Add")
        self.dbsSetLocation        = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation     = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation")
        self.dbsSetChecksum        = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi         = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow        = self.dbsDaoFactory(classname = "ListWorkflow")

        self.dbsLFNHeritage      = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # ACDC service
        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb  = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent")

        # Hold data for later commit
        self.dbsFilesToCreate  = []
        self.wmbsFilesToBuild  = []
        self.wmbsMergeFilesToBuild  = []
        self.fileLocation      = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave  = []
        self.listOfJobsToFail  = []
        self.filesetAssoc      = []
        self.parentageBinds    = []
        self.parentageBindsForMerge    = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID     = collections.deque(maxlen = 1000)
        self.datasetAlgoPaths  = collections.deque(maxlen = 1000)
        self.dbsLocations      = set()
        self.workflowIDs       = collections.deque(maxlen = 1000)
        self.workflowPaths     = collections.deque(maxlen = 1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return
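The datasetAlgo*, workflowIDs and workflowPaths members above are collections.deque instances with maxlen=1000, which act as cheap bounded "recently seen" caches: appending beyond the limit silently evicts the oldest entry, so memory stays bounded at the cost of occasionally re-processing an evicted item. A small illustration of that behaviour:

import collections

seen = collections.deque(maxlen=3)
for item in ["a", "b", "c", "d"]:
    if item not in seen:        # linear scan; fine for a small maxlen
        seen.append(item)

# "a" was evicted when "d" arrived, so it would be treated as new next time
assert list(seen) == ["b", "c", "d"]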
Example no. 46
    def testChunking(self):
        """
        _testChunking_

        Insert a workload and files that have several distinct sets of
        locations.  Verify that the chunks are created correctly and that they
        only group files that have the same set of locations.  Also verify that
        the chunks are pulled out of ACDC correctly.
        """
        dcs = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")

        testFileA = File(lfn=makeUUID(), size=1024, events=1024)
        testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn=makeUUID(), size=1024, events=1024)
        testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileB.addRun(Run(1, 3, 4))
        testFileC = File(lfn=makeUUID(), size=1024, events=1024)
        testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileC.addRun(Run(1, 5, 6))
        testJobA = self.getMinimalJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        testJobA.addFile(testFileC)

        testFileD = File(lfn=makeUUID(), size=1024, events=1024)
        testFileD.setLocation(["cmssrm.fnal.gov"])
        testFileD.addRun(Run(2, 1, 2))
        testFileE = File(lfn=makeUUID(), size=1024, events=1024)
        testFileE.setLocation(["cmssrm.fnal.gov"])
        testFileE.addRun(Run(2, 3, 4))
        testJobB = self.getMinimalJob()
        testJobB.addFile(testFileD)
        testJobB.addFile(testFileE)

        testFileF = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/F"})
        testFileF.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileF.addRun(Run(3, 1, 2))
        testFileG = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/G"})
        testFileG.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileG.addRun(Run(3, 3, 4))
        testFileH = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         parents={"/some/parent/H"})
        testFileH.setLocation(
            ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileH.addRun(Run(3, 5, 6))
        testJobC = self.getMinimalJob()
        testJobC.addFile(testFileF)
        testJobC.addFile(testFileG)
        testJobC.addFile(testFileH)

        testFileI = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileI.addRun(Run(4, 1, 2))
        testFileJ = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileJ.addRun(Run(4, 3, 4))
        testFileK = File(lfn=makeUUID(), size=1024, events=1024, merged=True)
        testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileK.addRun(Run(4, 5, 6))
        testJobD = self.getMinimalJob()
        testJobD.addFile(testFileI)
        testJobD.addFile(testFileJ)
        testJobD.addFile(testFileK)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
        chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5)

        self.assertEqual(
            len(chunks), 4,
            "Error: There should be four chunks: %s" % len(chunks))

        goldenMetaData = {
            1: {
                "lumis": 2,
                "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                "events": 1024
            },
            2: {
                "lumis": 4,
                "locations": ["cmssrm.fnal.gov"],
                "events": 2048
            },
            3: {
                "lumis": 6,
                "locations":
                ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"],
                "events": 3072
            },
            5: {
                "lumis": 10,
                "locations": ["castor.cern.ch", "cmssrm.fnal.gov"],
                "events": 5120
            }
        }

        testFiles = [
            testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK
        ]
        lastFile = testFileA
        for testFile in testFiles:
            if lastFile["lfn"] < testFile["lfn"]:
                lastFile = testFile

        testFiles.remove(lastFile)

        goldenFiles = {
            1: [lastFile],
            2: [testFileD, testFileE],
            3: [testFileF, testFileG, testFileH],
            5: testFiles
        }

        for chunk in chunks:
            chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                             chunk["offset"], chunk["files"])

            self.assertEqual(chunkMetaData["files"], chunk["files"])
            self.assertEqual(chunkMetaData["lumis"], chunk["lumis"])
            self.assertEqual(chunkMetaData["events"], chunk["events"])
            self.assertEqual(chunkMetaData["locations"], chunk["locations"])

            self.assertTrue(chunk["files"] in goldenMetaData.keys(),
                            "Error: Extra chunk found.")
            self.assertEqual(chunk["lumis"],
                             goldenMetaData[chunk["files"]]["lumis"],
                             "Error: Lumis in chunk is wrong.")
            self.assertEqual(chunk["locations"],
                             goldenMetaData[chunk["files"]]["locations"],
                             "Error: Locations in chunk is wrong.")
            self.assertEqual(chunk["events"],
                             goldenMetaData[chunk["files"]]["events"],
                             "Error: Events in chunk is wrong.")
            del goldenMetaData[chunk["files"]]

            chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                           chunk["offset"], chunk["files"])

            self.assertTrue(chunk["files"] in goldenFiles.keys(),
                            "Error: Extra chunk found.")
            goldenChunkFiles = goldenFiles[chunk["files"]]
            self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

            for chunkFile in chunkFiles:
                foundFile = None
                for goldenChunkFile in goldenChunkFiles:
                    if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                        foundFile = goldenChunkFile
                        break

                self.assertIsNotNone(
                    foundFile, "Error: Missing chunk file: %s, %s" %
                    (chunkFiles, goldenChunkFiles))
                self.assertEqual(set(foundFile["parents"]),
                                 chunkFile["parents"],
                                 "Error: File parents should match.")
                self.assertEqual(foundFile["merged"], chunkFile["merged"],
                                 "Error: File merged status should match.")
                self.assertEqual(foundFile["locations"],
                                 chunkFile["locations"],
                                 "Error: File locations should match.")
                self.assertEqual(foundFile["events"], chunkFile["events"])
                self.assertEqual(foundFile["size"], chunkFile["size"])
                self.assertEqual(len(foundFile["runs"]),
                                 len(chunkFile["runs"]),
                                 "Error: Wrong number of runs.")
                for run in foundFile["runs"]:
                    runMatch = False
                    for chunkRun in chunkFile["runs"]:
                        if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                            runMatch = True
                            break

                    self.assertTrue(runMatch,
                                    "Error: Run information is wrong.")

            del goldenFiles[chunk["files"]]

        singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(
            singleChunk, {
                "offset": 0,
                "files": 11,
                "events": 11264,
                "lumis": 22,
                "locations":
                {"castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"}
            }, "Error: Single chunk metadata is wrong")

        return
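The four expected chunks follow from grouping files by their exact location set before splitting each group into chunkSize pieces: the six files shared between castor.cern.ch and cmssrm.fnal.gov split into chunks of 5 and 1, while the two-file and three-file location groups each fit in a single chunk. A rough sketch of that grouping invariant (not the DataCollectionService implementation, just the rule the test verifies):

def chunkByLocation(files, chunkSize):
    """Group files by their frozen location set, then split into fixed-size chunks."""
    groups = {}
    for f in files:
        groups.setdefault(frozenset(f["locations"]), []).append(f)
    chunks = []
    for members in groups.values():
        for i in range(0, len(members), chunkSize):
            chunks.append(members[i:i + chunkSize])
    return chunks

files = ([{"locations": ["fnal", "cern"]}] * 6 +
         [{"locations": ["fnal"]}] * 2 +
         [{"locations": ["fnal", "cern", "ral"]}] * 3)
assert sorted(len(c) for c in chunkByLocation(files, 5)) == [1, 2, 3, 5]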
Example no. 47
    def testGetLumiWhitelist(self):
        """
        _testGetLumiWhitelist_

        Verify that the ACDC whitelist generation code works correctly.  We'll
        add jobs with the following lumi info:
          # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12]
          # Run 2, lumis [5, 6, 7], [10, 11, 12], [15]
          # Run 3, lumis [20]

        And should get out a whitelist that looks like this:
          {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
           "2": [[5, 7], [10, 12], [15, 15]],
           "3": [[20, 20]]}
        """
        dcs = DataCollectionService(url = self.testInit.couchUrl,
                                    database = "wmcore-acdc-datacollectionsvc")

        def getJob():
            job = Job()
            job["task"] = "/ACDCTest/reco"
            job["workflow"] = "ACDCTest"
            job["location"] = "cmssrm.fnal.gov"
            job["owner"] = "cmsdataops"
            job["group"] = "cmsdataops"
            return job

        testFileA = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileB.addRun(Run(1, 3))
        testJobA = getJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)

        testFileC = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileC.addRun(Run(1, 4, 6))
        testJobB = getJob()
        testJobB.addFile(testFileC)

        testFileD = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileD.addRun(Run(1, 7))
        testJobC = getJob()
        testJobC.addFile(testFileD)

        testFileE = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileE.addRun(Run(1, 11, 12))
        testJobD = getJob()
        testJobD.addFile(testFileE)

        testFileF = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileF.addRun(Run(2, 5, 6, 7))
        testJobE = getJob()
        testJobE.addFile(testFileF)

        testFileG = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileG.addRun(Run(2, 10, 11, 12))
        testJobF = getJob()
        testJobF.addFile(testFileG)

        testFileH = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileH.addRun(Run(2, 15))
        testJobG = getJob()
        testJobG.addFile(testFileH)

        testFileI = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileI.addRun(Run(3, 20))
        testJobH = getJob()
        testJobH.addFile(testFileI)

        testFileJ = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileJ.addRun(Run(1, 9))
        testJobI = getJob()
        testJobI.addFile(testFileJ)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
                        testJobF, testJobG, testJobH, testJobI])
        whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")

        self.assertEqual(len(whiteList.keys()), 3,
                         "Error: There should be 3 runs.")
        self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]],
                         "Error: Whitelist for run 1 is wrong.")
        self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]],
                         "Error: Whitelist for run 2 is wrong.")
        self.assertEqual(whiteList["3"], [[20, 20]],
                         "Error: Whitelist for run 3 is wrong.")
        return
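The expected whitelist is simply each run's union of lumi numbers compacted into inclusive [start, end] ranges: run 1 contributes lumis {1, 2, 3, 4, 6, 7, 9, 11, 12} across its files, which compacts to [[1, 4], [6, 7], [9, 9], [11, 12]]. A short sketch of that compaction step:

def compactLumis(lumis):
    """Collapse a set of lumi numbers into sorted inclusive ranges."""
    ranges = []
    for lumi in sorted(lumis):
        if ranges and lumi == ranges[-1][1] + 1:
            ranges[-1][1] = lumi          # extend the open range
        else:
            ranges.append([lumi, lumi])   # start a new range
    return ranges

assert compactLumis({1, 2, 3, 4, 6, 7, 9, 11, 12}) == [[1, 4], [6, 7], [9, 9], [11, 12]]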
Example no. 48
    def testChunking(self):
        """
        _testChunking_

        Insert a workload and files that have several distinct sets of
        locations.  Verify that the chunks are created correctly and that they
        only group files that have the same set of locations.  Also verify that
        the chunks are pulled out of ACDC correctly.
        """
        dcs = DataCollectionService(url = self.testInit.couchUrl,
                                    database = "wmcore-acdc-datacollectionsvc")

        def getJob():
            job = Job()
            job["task"] = "/ACDCTest/reco"
            job["workflow"] = "ACDCTest"
            job["location"] = "cmssrm.fnal.gov"
            job["owner"] = "cmsdataops"
            job["group"] = "cmsdataops"
            return job

        testFileA = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileB.addRun(Run(1, 3, 4))
        testFileC = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileC.addRun(Run(1, 5, 6))
        testJobA = getJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)
        testJobA.addFile(testFileC)

        testFileD = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileD.setLocation(["cmssrm.fnal.gov"])
        testFileD.addRun(Run(2, 1, 2))
        testFileE = File(lfn = makeUUID(), size = 1024, events = 1024)
        testFileE.setLocation(["cmssrm.fnal.gov"])
        testFileE.addRun(Run(2, 3, 4))
        testJobB = getJob()
        testJobB.addFile(testFileD)
        testJobB.addFile(testFileE)

        testFileF = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/F"]))
        testFileF.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileF.addRun(Run(3, 1, 2))
        testFileG = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/G"]))
        testFileG.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileG.addRun(Run(3, 3, 4))
        testFileH = File(lfn = makeUUID(), size = 1024, events = 1024,
                         parents = set(["/some/parent/H"]))
        testFileH.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"])
        testFileH.addRun(Run(3, 5, 6))
        testJobC = getJob()
        testJobC.addFile(testFileF)
        testJobC.addFile(testFileG)
        testJobC.addFile(testFileH)

        testFileI = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileI.addRun(Run(4, 1, 2))
        testFileJ = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileJ.addRun(Run(4, 3, 4))
        testFileK = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True)
        testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"])
        testFileK.addRun(Run(4, 5, 6))
        testJobD = getJob()
        testJobD.addFile(testFileI)
        testJobD.addFile(testFileJ)
        testJobD.addFile(testFileK)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD])
        chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco",
                                  chunkSize = 5)

        self.assertEqual(len(chunks), 4, "Error: There should be four chunks: %s" % len(chunks))

        goldenMetaData = {1: {"lumis": 2, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 1024},
                          2: {"lumis": 4, "locations": ["cmssrm.fnal.gov"], "events": 2048},
                          3: {"lumis": 6, "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"], "events": 3072},
                          5: {"lumis": 10, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 5120}}

        testFiles =[testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK]
        lastFile = testFileA
        for testFile in testFiles:
            if lastFile["lfn"] < testFile["lfn"]:
                lastFile = testFile

        testFiles.remove(lastFile)

        goldenFiles = {1: [lastFile],
                       2: [testFileD, testFileE],
                       3: [testFileF, testFileG, testFileH],
                       5: testFiles}

        for chunk in chunks:
            chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco",
                                             chunk["offset"], chunk["files"])

            self.assertEqual(chunkMetaData["files"], chunk["files"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["lumis"], chunk["lumis"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["events"], chunk["events"],
                             "Error: Metadata doesn't match.")
            self.assertEqual(chunkMetaData["locations"], chunk["locations"],
                             "Error: Metadata doesn't match.")

            self.assertTrue(chunk["files"] in goldenMetaData.keys(),
                            "Error: Extra chunk found.")
            self.assertEqual(chunk["lumis"], goldenMetaData[chunk["files"]]["lumis"],
                             "Error: Lumis in chunk is wrong.")
            self.assertEqual(chunk["locations"], goldenMetaData[chunk["files"]]["locations"],
                             "Error: Locations in chunk is wrong.")
            self.assertEqual(chunk["events"], goldenMetaData[chunk["files"]]["events"],
                             "Error: Events in chunk is wrong.")
            del goldenMetaData[chunk["files"]]

            chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco",
                                           chunk["offset"], chunk["files"])

            self.assertTrue(chunk["files"] in goldenFiles.keys(),
                            "Error: Extra chunk found.")
            goldenChunkFiles = goldenFiles[chunk["files"]]
            self.assertEqual(len(chunkFiles), len(goldenChunkFiles))

            for chunkFile in chunkFiles:
                foundFile = None
                for goldenChunkFile in goldenChunkFiles:
                    if chunkFile["lfn"] == goldenChunkFile["lfn"]:
                        foundFile = goldenChunkFile
                        break

                self.assertTrue(foundFile is not None,
                                "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles))
                self.assertEqual(foundFile["parents"], chunkFile["parents"],
                                 "Error: File parents should match.")
                self.assertEqual(foundFile["merged"], chunkFile["merged"],
                                 "Error: File merged status should match.")
                self.assertEqual(foundFile["locations"], chunkFile["locations"],
                                 "Error: File locations should match.")
                self.assertEqual(foundFile["events"], chunkFile["events"],
                                 "Error: File locations should match: %s" % chunk["files"])
                self.assertEqual(foundFile["size"], chunkFile["size"],
                                 "Error: File locations should match.")
                self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]),
                                 "Error: Wrong number of runs.")
                for run in foundFile["runs"]:
                    runMatch = False
                    for chunkRun in chunkFile["runs"]:
                        if chunkRun.run == run.run and chunkRun.lumis == run.lumis:
                            runMatch = True
                            break

                    self.assertTrue(runMatch, "Error: Run information is wrong.")

            del goldenFiles[chunk["files"]]

        singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(singleChunk, {"offset" : 0,
                                       "files" : 11,
                                       "events" : 11264,
                                       "lumis" : 22,
                                       "locations" : set(["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"])},
                         "Error: Single chunk metadata is wrong")

        return
Example no. 49
    def testGetLumiWhitelist(self):
        """
        _testGetLumiWhitelist_

        Verify that the ACDC whitelist generation code works correctly.  We'll
        add jobs with the following lumi info:
          # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12]
          # Run 2, lumis [5, 6, 7], [10, 11, 12], [15]
          # Run 3, lumis [20]

        And should get out a whitelist that looks like this:
          {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
           "2": [[5, 7], [10, 12], [15, 15]],
           "3": [[20, 20]]}
        """
        dcs = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")

        testFileA = File(lfn=makeUUID(), size=1024, events=1024)
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn=makeUUID(), size=1024, events=1024)
        testFileB.addRun(Run(1, 3))
        testJobA = self.getMinimalJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)

        testFileC = File(lfn=makeUUID(), size=1024, events=1024)
        testFileC.addRun(Run(1, 4, 6))
        testJobB = self.getMinimalJob()
        testJobB.addFile(testFileC)

        testFileD = File(lfn=makeUUID(), size=1024, events=1024)
        testFileD.addRun(Run(1, 7))
        testJobC = self.getMinimalJob()
        testJobC.addFile(testFileD)

        testFileE = File(lfn=makeUUID(), size=1024, events=1024)
        testFileE.addRun(Run(1, 11, 12))
        testJobD = self.getMinimalJob()
        testJobD.addFile(testFileE)

        testFileF = File(lfn=makeUUID(), size=1024, events=1024)
        testFileF.addRun(Run(2, 5, 6, 7))
        testJobE = self.getMinimalJob()
        testJobE.addFile(testFileF)

        testFileG = File(lfn=makeUUID(), size=1024, events=1024)
        testFileG.addRun(Run(2, 10, 11, 12))
        testJobF = self.getMinimalJob()
        testJobF.addFile(testFileG)

        testFileH = File(lfn=makeUUID(), size=1024, events=1024)
        testFileH.addRun(Run(2, 15))
        testJobG = self.getMinimalJob()
        testJobG.addFile(testFileH)

        testFileI = File(lfn=makeUUID(), size=1024, events=1024)
        testFileI.addRun(Run(3, 20))
        testJobH = self.getMinimalJob()
        testJobH.addFile(testFileI)

        testFileJ = File(lfn=makeUUID(), size=1024, events=1024)
        testFileJ.addRun(Run(1, 9))
        testJobI = self.getMinimalJob()
        testJobI.addFile(testFileJ)

        dcs.failedJobs([
            testJobA, testJobB, testJobC, testJobD, testJobE, testJobF,
            testJobG, testJobH, testJobI
        ])
        whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")

        self.assertEqual(len(whiteList.keys()), 3,
                         "Error: There should be 3 runs.")
        self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]],
                         "Error: Whitelist for run 1 is wrong.")
        self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]],
                         "Error: Whitelist for run 2 is wrong.")
        self.assertEqual(whiteList["3"], [[20, 20]],
                         "Error: Whitelist for run 3 is wrong.")

        correctLumiList = LumiList(
            compactList={
                "1": [[1, 4], [6, 7], [9, 9], [11, 12]],
                "2": [[5, 7], [10, 12], [15, 15]],
                "3": [[20, 20]]
            })
        testLumiList = dcs.getLumilistWhitelist("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(correctLumiList.getCMSSWString(),
                         testLumiList.getCMSSWString())

        return
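The final assertion compares getCMSSWString() output rather than the raw structures, which sidesteps differences in key types (string vs. integer runs) and range ordering between the expected and returned lumi lists. A hypothetical normalization helper showing the same idea without LumiList:

def canonical(compactList):
    """Normalize a {run: [[start, end], ...]} whitelist for comparison."""
    return {str(run): sorted([int(a), int(b)] for a, b in ranges)
            for run, ranges in compactList.items()}

a = {"1": [[6, 7], [1, 4], [9, 9], [11, 12]]}
b = {1: [[1, 4], [6, 7], [9, 9], [11, 12]]}
assert canonical(a) == canonical(b)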
Example no. 50
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(
            classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(
            classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(
            classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(
            classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(
            classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(
            classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(
            classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(
            classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(
            classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(
            classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(
            classname="Jobs.GetFWJRTaskName")
        self.pnn_to_psn = self.daofactory(
            classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(
            classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(
            classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(
            classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")

        self.dbsLFNHeritage = self.dbsDaoFactory(
            classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant,
                                       'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output; anything larger is put into
        # the Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(
            config.JobAccountant, 'maxAllowedRepackOutputSize',
            12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later commit
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return
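maxAllowedRepackOutputSize defaults to 12 * 1024**3 bytes (12 GiB); per the comment above, any Repack RAW output larger than that is placed into an Error dataset and skips PromptReco. A tiny illustrative guard (the function and labels are hypothetical, not the accountant's actual routing code):

MAX_REPACK_OUTPUT = 12 * 1024 * 1024 * 1024   # 12 GiB, the default above

def repackDestination(sizeBytes, maxAllowed=MAX_REPACK_OUTPUT):
    """Destination implied by the size guard described in the comment above."""
    if sizeBytes > maxAllowed:
        return "Error dataset (skips PromptReco)"
    return "PromptReco"

assert repackDestination(11 * 1024 ** 3) == "PromptReco"
assert repackDestination(13 * 1024 ** 3) == "Error dataset (skips PromptReco)"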
Example no. 51
    def testC_ACDCTest(self):
        """
        _ACDCTest_

        Test whether we can get a goodRunList out of ACDC
        and process it correctly.
        """
        workload = self.createTestWorkload()
        dcs = DataCollectionService(url=self.testInit.couchUrl, database=self.testInit.couchDbName)

        testFileA = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileA.addRun(Run(1, 1, 2))
        testFileA.create()
        testFileB = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileB.addRun(Run(1, 3))
        testFileB.create()
        testJobA = getJob(workload)
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)

        testFileC = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileC.addRun(Run(1, 4, 6))
        testFileC.create()
        testJobB = getJob(workload)
        testJobB.addFile(testFileC)

        testFileD = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileD.addRun(Run(1, 7))
        testFileD.create()
        testJobC = getJob(workload)
        testJobC.addFile(testFileD)

        testFileE = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileE.addRun(Run(1, 11, 12))
        testFileE.create()
        testJobD = getJob(workload)
        testJobD.addFile(testFileE)

        testFileF = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileF.addRun(Run(2, 5, 6, 7))
        testFileF.create()
        testJobE = getJob(workload)
        testJobE.addFile(testFileF)

        testFileG = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileG.addRun(Run(2, 10, 11, 12))
        testFileG.create()
        testJobF = getJob(workload)
        testJobF.addFile(testFileG)

        testFileH = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileH.addRun(Run(2, 15))
        testFileH.create()
        testJobG = getJob(workload)
        testJobG.addFile(testFileH)

        testFileI = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileI.addRun(Run(3, 20))
        testFileI.create()
        testJobH = getJob(workload)
        testJobH.addFile(testFileI)

        testFileJ = File(lfn=makeUUID(), size=1024, events=1024, locations="T1_US_FNAL_Disk")
        testFileJ.addRun(Run(1, 9))
        testFileJ.create()
        testJobI = getJob(workload)
        testJobI.addFile(testFileJ)

        # dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
        #                testJobF, testJobG, testJobH, testJobI])

        dcs.failedJobs([testJobA, testJobD, testJobH])

        baseName = makeUUID()

        testFileset = Fileset(name=baseName)
        testFileset.create()
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)
        testFileset.addFile(testFileD)
        testFileset.addFile(testFileE)
        testFileset.addFile(testFileF)
        testFileset.addFile(testFileG)
        testFileset.addFile(testFileH)
        testFileset.addFile(testFileI)
        testFileset.addFile(testFileJ)
        testFileset.commit()

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="LumiBased",
                                        type="Processing")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)

        jobGroups = jobFactory(lumis_per_job=100,
                               halt_job_on_file_boundaries=False,
                               splitOnRun=True,
                               collectionName=workload.name(),
                               filesetName=workload.getTask("reco").getPathName(),
                               owner="evansde77",
                               group="DMWM",
                               couchURL=self.testInit.couchUrl,
                               couchDB=self.testInit.couchDbName,
                               performance=self.performanceParams)

        self.assertEqual(jobGroups[0].jobs[0]['mask'].getRunAndLumis(), {1: [[1, 2], [3, 3], [11, 12]]})
        self.assertEqual(jobGroups[0].jobs[1]['mask'].getRunAndLumis(), {3: [[20, 20]]})

        return
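The two expected masks can be read straight off the failed jobs: testJobA carries run 1 lumis 1-2 (file A) and 3 (file B), testJobD carries run 1 lumis 11-12, and testJobH carries run 3 lumi 20. With splitOnRun=True, run 1 and run 3 end up in separate jobs, and each file's lumis stay as their own [first, last] range, which is why [1, 2] and [3, 3] are not merged. A sketch reproducing the expected masks from those inputs:

# (run, lumis) per failed file, as created above
failedFiles = [(1, [1, 2]),     # testFileA, job A
               (1, [3]),        # testFileB, job A
               (1, [11, 12]),   # testFileE, job D
               (3, [20])]       # testFileI, job H

masks = {}   # run -> one [first, last] range per file, not merged across files
for run, lumis in failedFiles:
    masks.setdefault(run, []).append([min(lumis), max(lumis)])

# splitOnRun=True puts run 1 and run 3 into different jobs
assert masks[1] == [[1, 2], [3, 3], [11, 12]]
assert masks[3] == [[20, 20]]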
Example no. 52
    def testC_ACDCTest(self):
        """
        _ACDCTest_

        Test whether we can get a goodRunList out of ACDC
        and process it correctly.
        """
        workload = self.createTestWorkload()
        dcs = DataCollectionService(url=self.testInit.couchUrl,
                                    database=self.testInit.couchDbName)

        testFileA = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileA.addRun(Run(1, 1, 2))
        testFileA.create()
        testFileB = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileB.addRun(Run(1, 3))
        testFileB.create()
        testJobA = getJob(workload)
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)

        testFileC = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileC.addRun(Run(1, 4, 6))
        testFileC.create()
        testJobB = getJob(workload)
        testJobB.addFile(testFileC)

        testFileD = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileD.addRun(Run(1, 7))
        testFileD.create()
        testJobC = getJob(workload)
        testJobC.addFile(testFileD)

        testFileE = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileE.addRun(Run(1, 11, 12))
        testFileE.create()
        testJobD = getJob(workload)
        testJobD.addFile(testFileE)

        testFileF = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileF.addRun(Run(2, 5, 6, 7))
        testFileF.create()
        testJobE = getJob(workload)
        testJobE.addFile(testFileF)

        testFileG = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileG.addRun(Run(2, 10, 11, 12))
        testFileG.create()
        testJobF = getJob(workload)
        testJobF.addFile(testFileG)

        testFileH = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileH.addRun(Run(2, 15))
        testFileH.create()
        testJobG = getJob(workload)
        testJobG.addFile(testFileH)

        testFileI = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileI.addRun(Run(3, 20))
        testFileI.create()
        testJobH = getJob(workload)
        testJobH.addFile(testFileI)

        testFileJ = File(lfn=makeUUID(),
                         size=1024,
                         events=1024,
                         locations="T1_US_FNAL_Disk")
        testFileJ.addRun(Run(1, 9))
        testFileJ.create()
        testJobI = getJob(workload)
        testJobI.addFile(testFileJ)

        # dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
        #                testJobF, testJobG, testJobH, testJobI])

        dcs.failedJobs([testJobA, testJobD, testJobH])

        baseName = makeUUID()

        testFileset = Fileset(name=baseName)
        testFileset.create()
        testFileset.addFile(testFileA)
        testFileset.addFile(testFileB)
        testFileset.addFile(testFileC)
        testFileset.addFile(testFileD)
        testFileset.addFile(testFileE)
        testFileset.addFile(testFileF)
        testFileset.addFile(testFileG)
        testFileset.addFile(testFileH)
        testFileset.addFile(testFileI)
        testFileset.addFile(testFileJ)
        testFileset.commit()

        testSubscription = Subscription(fileset=testFileset,
                                        workflow=self.testWorkflow,
                                        split_algo="LumiBased",
                                        type="Processing")
        testSubscription.create()

        splitter = SplitterFactory()
        jobFactory = splitter(package="WMCore.WMBS",
                              subscription=testSubscription)

        jobGroups = jobFactory(
            lumis_per_job=100,
            halt_job_on_file_boundaries=False,
            splitOnRun=True,
            collectionName=workload.name(),
            filesetName=workload.getTask("reco").getPathName(),
            owner="evansde77",
            group="DMWM",
            couchURL=self.testInit.couchUrl,
            couchDB=self.testInit.couchDbName,
            performance=self.performanceParams)

        self.assertEqual(jobGroups[0].jobs[0]['mask'].getRunAndLumis(),
                         {1: [[1, 2], [3, 3], [11, 12]]})
        self.assertEqual(jobGroups[0].jobs[1]['mask'].getRunAndLumis(),
                         {3: [[20, 20]]})

        return
Example no. 53
class ErrorHandlerTest(EmulatedUnitTestCase):
    """
    TestCase for TestErrorHandler module
    """
    def setUp(self):
        """
        setup for test.
        """
        super(ErrorHandlerTest, self).setUp()
        myThread = threading.currentThread()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=["WMCore.WMBS"],
                                useDefault=False)
        self.testInit.setupCouch("errorhandler_t", "GroupUser", "ACDC")
        self.testInit.setupCouch("errorhandler_t_jd/jobs", "JobDump")
        self.testInit.setupCouch("errorhandler_t_jd/fwjrs", "FWJRDump")

        self.daofactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.getJobs = self.daofactory(classname="Jobs.GetAllJobs")
        self.setJobTime = self.daofactory(classname="Jobs.SetStateTime")
        locationAction = self.daofactory(classname="Locations.New")
        locationAction.execute(siteName="malpaquet", pnn="T2_CH_CERN")
        self.testDir = self.testInit.generateWorkDir()
        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.nJobs = 10

        self.dataCS = DataCollectionService(url=self.testInit.couchUrl,
                                            database="errorhandler_t")

        return

    def tearDown(self):
        """
        Database deletion
        """
        self.testInit.clearDatabase()
        self.testInit.delWorkDir()
        self.testInit.tearDownCouch()
        EmulatorSetup.deleteConfig(self.configFile)
        return

    def getConfig(self):
        """
        _getConfig_

        """
        config = self.testInit.getConfiguration()
        self.testInit.generateWorkDir(config)

        # First the general stuff
        config.section_("General")
        config.General.workDir = os.getenv("TESTDIR", self.testDir)
        config.section_("CoreDatabase")
        config.CoreDatabase.connectUrl = os.getenv("DATABASE")
        config.CoreDatabase.socket = os.getenv("DBSOCK")

        config.component_("ErrorHandler")
        # The log level of the component.
        config.ErrorHandler.logLevel = 'INFO'
        # The namespace of the component
        config.ErrorHandler.namespace = 'WMComponent.ErrorHandler.ErrorHandler'
        # maximum number of threads we want to deal with messages per pool
        # config.ErrorHandler.maxThreads = 30
        config.ErrorHandler.maxProcessSize = 30
        config.ErrorHandler.readFWJR = True
        # maximum number of retries we want for a job
        config.ErrorHandler.maxRetries = 5
        # The poll interval at which to look for failed jobs
        config.ErrorHandler.pollInterval = 60

        # JobStateMachine; this will be overwritten in some unittests
        config.component_('JobStateMachine')
        config.JobStateMachine.couchurl = os.getenv('COUCHURL', None)
        config.JobStateMachine.couchDBName = "errorhandler_t_jd"

        config.section_('ACDC')
        config.ACDC.couchurl = self.testInit.couchUrl
        config.ACDC.database = "errorhandler_t"

        return config

    def createWorkload(self, workloadName='Test'):
        """
        _createTestWorkload_

        Creates a test workload for us to run on, hold the basic necessities.
        """

        workload = testWorkload(workloadName)

        # Add RequestManager stuff
        workload.data.request.section_('schema')
        workload.data.request.schema.Requestor = 'nobody'
        workload.data.request.schema.Group = 'testers'

        taskMaker = TaskMaker(workload,
                              os.path.join(self.testDir, 'workloadTest'))
        taskMaker.skipSubscription = True
        taskMaker.processWorkload()

        return workload

    def createTestJobGroup(self,
                           nJobs=10,
                           retry_count=1,
                           workloadPath='test',
                           fwjrPath=None,
                           workloadName=makeUUID(),
                           fileModifier=''):
        """
        Creates a group of several jobs
        """

        myThread = threading.currentThread()
        myThread.transaction.begin()
        testWorkflow = Workflow(spec=workloadPath,
                                owner="cmsdataops",
                                group="cmsdataops",
                                name=workloadName,
                                task="/TestWorkload/ReReco")
        testWorkflow.create()

        testWMBSFileset = Fileset(name="TestFileset")
        testWMBSFileset.create()

        testSubscription = Subscription(fileset=testWMBSFileset,
                                        workflow=testWorkflow)
        testSubscription.create()

        testJobGroup = JobGroup(subscription=testSubscription)
        testJobGroup.create()

        testFile0 = File(lfn="/this/is/a/parent%s" % fileModifier,
                         size=1024,
                         events=10)
        testFile0.addRun(Run(10, *[12312]))
        testFile0.setLocation('T2_CH_CERN')

        testFileA = File(lfn="/this/is/a/lfnA%s" % fileModifier,
                         size=1024,
                         events=10,
                         first_event=88,
                         merged=False)
        testFileA.addRun(Run(10, *[12312, 12313]))
        testFileA.setLocation('T2_CH_CERN')

        testFileB = File(lfn="/this/is/a/lfnB%s" % fileModifier,
                         size=1024,
                         events=10,
                         first_event=88,
                         merged=False)
        testFileB.addRun(Run(10, *[12314, 12315, 12316]))
        testFileB.setLocation('T2_CH_CERN')

        testFile0.create()
        testFileA.create()
        testFileB.create()

        testFileA.addParent(lfn="/this/is/a/parent%s" % fileModifier)
        testFileB.addParent(lfn="/this/is/a/parent%s" % fileModifier)

        for i in range(0, nJobs):
            testJob = Job(name=makeUUID())
            testJob['retry_count'] = retry_count
            testJob['retry_max'] = 10
            testJob['mask'].addRunAndLumis(run=10, lumis=[12312])
            testJob['mask'].addRunAndLumis(run=10, lumis=[12314, 12316])
            testJob['cache_dir'] = os.path.join(self.testDir, testJob['name'])
            testJob['fwjr_path'] = fwjrPath
            os.mkdir(testJob['cache_dir'])
            testJobGroup.add(testJob)
            testJob.create(group=testJobGroup)
            testJob.addFile(testFileA)
            testJob.addFile(testFileB)
            testJob.save()

        testJobGroup.commit()

        testSubscription.acquireFiles(files=[testFileA, testFileB])
        testSubscription.save()
        myThread.transaction.commit()

        return testJobGroup
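
    # For orientation (illustrative comment, not part of the original test
    # class): the tests below drive these jobs through the JobStateMachine
    # via ChangeState.propagate(jobs, newState, oldState) along these paths:
    #   create failure:  new -> created -> createfailed
    #   submit failure:  new -> created -> submitfailed
    #   runtime failure: new -> created -> executing -> complete -> jobfailed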

    def testA_Create(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testCreate()

        Mimics creation of component and test jobs failed in create stage.
        """
        njobs = 4
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=njobs,
                                               workloadPath=workloadPath,
                                               workloadName=workloadName)
        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'createfailed', 'created')

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), njobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        # These should go directly to exhausted
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), njobs)

        # Check that it showed up in ACDC
        collection = self.dataCS.getDataCollection(workloadName)

        # Now look at what's inside
        self.assertTrue(len(collection['filesets']) > 0)
        for fileset in collection["filesets"]:
            counter = 0
            for f in fileset.listFiles():
                counter += 1
                self.assertTrue(
                    f['lfn'] in ["/this/is/a/lfnA", "/this/is/a/lfnB"])
                self.assertEqual(f['events'], 10)
                self.assertEqual(f['size'], 1024)
                self.assertEqual(f['parents'], [u'/this/is/a/parent'])
                self.assertTrue(f['runs'][0]['run_number'] == 10)
                if f['lfn'] == "/this/is/a/lfnA":
                    self.assertItemsEqual(f['runs'][0]['lumis'], [12312])
                elif f['lfn'] == "/this/is/a/lfnB":
                    self.assertItemsEqual(f['runs'][0]['lumis'],
                                          [12314, 12315, 12316])
                else:
                    self.fail("File name is not known: %s" % f['lfn'])
                self.assertEqual(f['merged'], 0)
                self.assertEqual(f['first_event'], 88)
            self.assertEqual(
                counter,
                njobs * 2)  # each job contributes its 2 input files to the fileset
        return

    def testB_Submit(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t:testSubmit()

        Mimics creation of component and test jobs failed in submit stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'submitfailed', 'created')

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='SubmitFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='SubmitCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testC_Jobs(self):
        """
        WMComponent_t.ErrorHandler_t.ErrorHandler_t.testJobs()

        Mimics creation of component and test jobs failed in execute stage.
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), self.nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        return

    def testD_Exhausted(self):
        """
        _testExhausted_

        Test that the system can exhaust jobs correctly
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               retry_count=5,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        config.ErrorHandler.maxRetries = 1
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testSubscription = Subscription(id=1)  # You should only have one
        testSubscription.load()
        testSubscription.loadData()

        # Do we have files to start with?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 2)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.algorithm(None)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        # Did we fail the files?
        self.assertEqual(len(testSubscription.filesOfStatus("Acquired")), 0)
        self.assertEqual(len(testSubscription.filesOfStatus("Failed")), 2)

    def testE_FailJobs(self):
        """
        _FailJobs_

        Test our ability to fail jobs based on the information in the FWJR
        """
        workloadName = 'TestWorkload'

        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        fwjrPath = os.path.join(WMCore.WMBase.getTestBase(),
                                "WMComponent_t/JobAccountant_t",
                                "fwjrs/badBackfillJobReport.pkl")

        testJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                               workloadPath=workloadPath,
                                               fwjrPath=fwjrPath)

        badJobGroup = self.createTestJobGroup(nJobs=self.nJobs,
                                              workloadPath=workloadPath,
                                              fwjrPath=None,
                                              fileModifier='bad')

        config = self.getConfig()
        config.ErrorHandler.readFWJR = True
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')
        changer.propagate(badJobGroup.jobs, 'created', 'new')
        changer.propagate(badJobGroup.jobs, 'executing', 'created')
        changer.propagate(badJobGroup.jobs, 'complete', 'executing')
        changer.propagate(badJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)
        testErrorHandler.exitCodesNoRetry = [8020]
        testErrorHandler.algorithm(None)

        # This should exhaust all jobs due to exit code
        # Except those with no fwjr
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = -10
        testErrorHandler2 = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler2.reqAuxDB = None

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler2.algorithm(None)

        # This should exhaust all jobs due to timeout
        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)
        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), self.nJobs)
        idList = self.getJobs.execute(state='Exhausted')
        self.assertEqual(len(idList), self.nJobs)

        config.ErrorHandler.maxFailTime = 24 * 3600
        config.ErrorHandler.passExitCodes = [8020]
        testErrorHandler3 = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler3.reqAuxDB = None

        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        testErrorHandler3.algorithm(None)

        # This should pass all jobs due to exit code
        idList = self.getJobs.execute(state='Created')
        self.assertEqual(len(idList), self.nJobs)

        return

    @attr('integration')
    def testZ_Profile(self):
        """
        _testProfile_

        Do a full profile of the poller
        """

        nJobs = 100
        workloadName = 'TestWorkload'
        self.createWorkload(workloadName=workloadName)
        workloadPath = os.path.join(self.testDir, 'workloadTest', workloadName,
                                    'WMSandbox', 'WMWorkload.pkl')

        testJobGroup = self.createTestJobGroup(nJobs=nJobs,
                                               workloadPath=workloadPath)

        config = self.getConfig()
        changer = ChangeState(config)
        changer.propagate(testJobGroup.jobs, 'created', 'new')
        changer.propagate(testJobGroup.jobs, 'executing', 'created')
        changer.propagate(testJobGroup.jobs, 'complete', 'executing')
        changer.propagate(testJobGroup.jobs, 'jobfailed', 'complete')

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), nJobs)

        testErrorHandler = ErrorHandlerPoller(config)
        # set reqAuxDB to None for the test
        testErrorHandler.reqAuxDB = None
        testErrorHandler.setup(None)

        startTime = time.time()
        cProfile.runctx("testErrorHandler.algorithm()",
                        globals(),
                        locals(),
                        filename="profStats.stat")
        stopTime = time.time()

        idList = self.getJobs.execute(state='CreateFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobFailed')
        self.assertEqual(len(idList), 0)

        idList = self.getJobs.execute(state='JobCooloff')
        self.assertEqual(len(idList), nJobs)

        print("Took %f seconds to run polling algo" % (stopTime - startTime))

        p = pstats.Stats('profStats.stat')
        p.sort_stats('cumulative')
        p.print_stats(0.2)

        return
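
A minimal standalone sketch of the cProfile/pstats pattern used in
testZ_Profile above; pollOnce here is a stand-in for the profiled
testErrorHandler.algorithm(None) call:

import cProfile
import pstats

def pollOnce():
    # Stand-in workload for the poller algorithm.
    return sum(i * i for i in range(100000))

cProfile.runctx("pollOnce()", globals(), locals(), filename="profStats.stat")

p = pstats.Stats("profStats.stat")
p.sort_stats("cumulative")  # order entries by cumulative time
p.print_stats(0.2)          # print only the top 20% of entries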
Exemplo n.º 54
0
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        Split files into a number of lumis per job
        Allow a flag to determine if we split files between jobs
        """

        myThread = threading.currentThread()

        lumisPerJob = int(kwargs.get('lumis_per_job', 1))
        splitOnFile = bool(kwargs.get('halt_job_on_file_boundaries', True))
        ignoreACDC = bool(kwargs.get('ignore_acdc_except', False))
        collectionName = kwargs.get('collectionName', None)
        splitOnRun = kwargs.get('splitOnRun', True)
        getParents = kwargs.get('include_parents', False)
        runWhitelist = kwargs.get('runWhitelist', [])
        runs = kwargs.get('runs', None)
        lumis = kwargs.get('lumis', None)
        deterministicPileup = kwargs.get('deterministicPileup', False)
        eventsPerLumiInDataset = 0

        if deterministicPileup and self.package == 'WMCore.WMBS':
            getJobNumber = self.daoFactory(
                classname="Jobs.GetNumberOfJobsPerWorkflow")
            jobNumber = getJobNumber.execute(
                workflow=self.subscription.getWorkflow().id)
            self.nJobs = jobNumber

        timePerEvent, sizePerEvent, memoryRequirement = \
                    self.getPerformanceParameters(kwargs.get('performance', {}))

        goodRunList = {}
        if runs and lumis:
            goodRunList = buildLumiMask(runs, lumis)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner = kwargs.get('owner')
                group = kwargs.get('group')

                logging.info('Creating jobs for ACDC fileset %s' % filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                goodRunList = dcs.getLumiWhitelist(collectionName, filesetName,
                                                   owner, group)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                if ignoreACDC:
                    msg += "Ditching goodRunList\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    goodRunList = {}
                else:
                    msg += "Refusing to create any jobs.\n"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logging.error(msg)
                    return
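
For orientation, a minimal sketch of the goodRunList shape this splitter
consumes, in the style of what buildLumiMask or getLumiWhitelist hand back:
a mapping of run numbers to inclusive [firstLumi, lastLumi] ranges. The run
and lumi values below are made up, and lumiInMask is an illustrative helper,
not a WMCore API:

# Hypothetical lumi mask: run 180241 keeps lumis 5-10 and 25-30,
# run 180250 keeps lumis 1-4.
goodRunList = {
    "180241": [[5, 10], [25, 30]],
    "180250": [[1, 4]],
}

def lumiInMask(mask, run, lumi):
    # True if (run, lumi) falls inside one of the mask's ranges.
    return any(first <= lumi <= last
               for first, last in mask.get(str(run), []))

assert lumiInMask(goodRunList, 180241, 7)
assert not lumiInMask(goodRunList, 180250, 9)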
Exemplo n.º 55
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries     = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes      = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime    = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR       = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes      = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs    = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad     = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        #    self.sendAlert will then be available
        self.initAlerts(compName = "ErrorHandler")

        # Some exit codes imply an immediate failure, non-configurable
        self.exitCodes.extend(WMJobPermanentSystemErrors)

        return

    def setup(self, parameters = None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs    = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg = msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary = True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary = True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []
        for job in jobList:
            report     = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" % job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodes]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    self.sendAlert(4, msg = msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs
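
A minimal sketch of the exit-code decision readFWJRForErrors applies once a
FWJR has been reduced to its list of exit codes; the function and argument
names are illustrative, and the real method also checks maxFailTime first:

def classifyByExitCodes(exitCodes, failCodes, passCodes):
    # Mirrors the ordering above: fatal codes exhaust the job,
    # pass codes retry it immediately, anything else cools off.
    if any(code in failCodes for code in exitCodes):
        return 'exhaust'
    if any(code in passCodes for code in exitCodes):
        return 'pass'
    return 'cooloff'

print(classifyByExitCodes([8020], failCodes=[8020], passCodes=[]))      # exhaust
print(classifyByExitCodes([8021], failCodes=[8020], passCodes=[8021]))  # pass
print(classifyByExitCodes([99], failCodes=[8020], passCodes=[8021]))    # cooloff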
Exemplo n.º 56
0
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        An event base splitting algorithm.  All available files are split into a
        set number of events per job.
        """
        eventsPerJob = int(kwargs.get("events_per_job", 100))
        eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
        getParents = kwargs.get("include_parents", False)
        lheInput = kwargs.get("lheInputFiles", False)
        collectionName = kwargs.get('collectionName', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
            self.getPerformanceParameters(kwargs.get('performance', {}))
        acdcFileList = []
        deterministicPileup = kwargs.get('deterministicPileup', False)

        if deterministicPileup and self.package == 'WMCore.WMBS':
            getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
            logging.info('Creating %d jobs in DeterministicPileup mode', self.nJobs)

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                owner = kwargs.get('owner')
                group = kwargs.get('group')
                logging.info('Creating jobs for ACDC fileset %s', filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName, owner, group)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return

        totalJobs = 0

        locationDict = self.sortByLocation()
        for location in locationDict:
            self.newGroup()
            fileList = locationDict[location]
            getRunLumiInformation = False
            for f in fileList:
                if f['lfn'].startswith("MCFakeFile"):
                    # At least one MCFakeFile is present, so we need run information
                    getRunLumiInformation = True
                    break
            if getRunLumiInformation:
                if self.package == 'WMCore.WMBS':
                    loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
                    fileLumis = loadRunLumi.execute(files=fileList)
                    for f in fileList:
                        lumiDict = fileLumis.get(f['id'], {})
                        for run in lumiDict.keys():
                            f.addRun(run=Run(run, *lumiDict[run]))

            for f in fileList:
                currentEvent = f['first_event']
                eventsInFile = f['events']
                runs = list(f['runs'])
                # We got the runs, clean the file.
                f['runs'] = set()

                if getParents:
                    parentLFNs = self.findParent(lfn=f['lfn'])
                    for lfn in parentLFNs:
                        parent = File(lfn=lfn)
                        f['parents'].add(parent)

                if acdcFileList:
                    if f['lfn'] in [x['lfn'] for x in acdcFileList]:
                        totalJobs = self.createACDCJobs(f, acdcFileList, timePerEvent,
                                                        sizePerEvent, memoryRequirement,
                                                        lheInput, eventsPerJob, eventsPerLumi,
                                                        deterministicPileup, totalJobs)
                    continue
                if not f['lfn'].startswith("MCFakeFile"):
                    # Very uncommon, but this job has a real input dataset
                    if eventsInFile >= eventsPerJob:
                        while currentEvent < eventsInFile:
                            self.newJob(name=self.getJobName(length=totalJobs))
                            self.currentJob.addFile(f)
                            if eventsPerJob + currentEvent < eventsInFile:
                                jobTime = eventsPerJob * timePerEvent
                                diskRequired = eventsPerJob * sizePerEvent
                                self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob, currentEvent)
                            else:
                                jobTime = (eventsInFile - currentEvent) * timePerEvent
                                diskRequired = (eventsInFile - currentEvent) * sizePerEvent
                                self.currentJob["mask"].setMaxAndSkipEvents(None,
                                                                            currentEvent)
                            self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                                 memory=memoryRequirement,
                                                                 disk=diskRequired)
                            if deterministicPileup:
                                self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)

                            logging.debug("Job created for real input with %s", self.currentJob)
                            currentEvent += eventsPerJob
                            totalJobs += 1
                    else:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        jobTime = eventsInFile * timePerEvent
                        diskRequired = eventsInFile * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)
                        logging.debug("Last job created for real input with %s", self.currentJob)
                        totalJobs += 1
                else:
                    # This assumes there's only one run, which is the case for MC
                    lumis = runs[0].lumis
                    (firstLumi, lastLumi) = (min(lumis), max(lumis))
                    currentLumi = firstLumi
                    totalEvents = 0
                    if eventsInFile >= eventsPerJob:
                        while totalEvents < eventsInFile:
                            self.newJob(name=self.getJobName(length=totalJobs))
                            self.currentJob.addFile(f)
                            self.currentJob.addBaggageParameter("lheInputFiles", lheInput)
                            lumisPerJob = int(ceil(float(eventsPerJob)
                                                   / eventsPerLumi))
                            # Limit the number of events to an unsigned 32-bit int
                            eventsRemaining = eventsInFile - totalEvents
                            if (currentEvent + eventsPerJob - 1) > (2 ** 32 - 1) and (
                                    currentEvent + eventsRemaining - 1) > (2 ** 32 - 1):
                                currentEvent = 1
                            if eventsRemaining > eventsPerJob:
                                self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob,
                                                                            currentEvent)
                                self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob,
                                                                           currentLumi)
                                jobTime = eventsPerJob * timePerEvent
                                diskRequired = eventsPerJob * sizePerEvent
                            else:
                                jobTime = eventsRemaining * timePerEvent
                                diskRequired = eventsRemaining * sizePerEvent
                                lumisPerJob = int(ceil(float(eventsRemaining) / eventsPerLumi))
                                self.currentJob["mask"].setMaxAndSkipEvents(eventsRemaining,
                                                                            currentEvent)
                                self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob,
                                                                           currentLumi)

                            if deterministicPileup:
                                self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)
                            currentLumi += lumisPerJob
                            currentEvent += eventsPerJob
                            totalEvents += eventsPerJob
                            totalJobs += 1
                            self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                                 memory=memoryRequirement,
                                                                 disk=diskRequired)
                    else:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        # For MC we use firstEvent instead of skipEvents so set it to 1
                        # We must check for events going over 2**32 - 1 here too
                        if (eventsInFile + currentEvent - 1) > (2 ** 32 - 1):
                            currentEvent = 1
                        self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile,
                                                                    currentEvent)
                        self.currentJob["mask"].setMaxAndSkipLumis(lastLumi -
                                                                   currentLumi + 1, currentLumi)
                        jobTime = eventsInFile * timePerEvent
                        diskRequired = eventsInFile * sizePerEvent
                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)
                        totalJobs += 1
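
A minimal sketch of the (maxEvents, skipEvents) mask sequence the real-input
branch above produces; only the splitting arithmetic is reproduced, under the
same loop conditions:

def eventJobMasks(eventsInFile, eventsPerJob, firstEvent=0):
    # Per-job (maxEvents, skipEvents) pairs, as set via
    # mask.setMaxAndSkipEvents() above; the last job gets
    # maxEvents=None, i.e. no LastEvent cap.
    masks = []
    currentEvent = firstEvent
    while currentEvent < eventsInFile:
        if eventsPerJob + currentEvent < eventsInFile:
            masks.append((eventsPerJob, currentEvent))
        else:
            masks.append((None, currentEvent))
        currentEvent += eventsPerJob
    return masks

print(eventJobMasks(250, 100))  # [(100, 0), (100, 100), (None, 200)]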
Exemplo n.º 57
0
    def algorithm(self, *args, **kwargs):
        """
        _algorithm_

        An event base splitting algorithm.  All available files are split into a
        set number of events per job.
        """
        eventsPerJob = int(kwargs.get("events_per_job", 100))
        eventsPerLumi = int(kwargs.get("events_per_lumi", eventsPerJob))
        getParents = kwargs.get("include_parents", False)
        lheInput = kwargs.get("lheInputFiles", False)
        collectionName = kwargs.get('collectionName', None)
        timePerEvent, sizePerEvent, memoryRequirement = \
            self.getPerformanceParameters(kwargs.get('performance', {}))
        acdcFileList = []
        deterministicPileup = kwargs.get('deterministicPileup', False)

        if eventsPerJob <= 0 or eventsPerLumi <= 0:
            msg = "events_per_job and events_per_lumi must be positive. Their values are: "
            msg += "events_per_job: %d, events_per_lumi: %d" % (eventsPerJob, eventsPerLumi)
            raise RuntimeError(msg)

        if deterministicPileup and self.package == 'WMCore.WMBS':
            getJobNumber = self.daoFactory(classname="Jobs.GetNumberOfJobsPerWorkflow")
            self.nJobs = getJobNumber.execute(workflow=self.subscription.getWorkflow().id)
            logging.info('Creating jobs in DeterministicPileup mode for %s',
                         self.subscription.workflowName())

        # If we have runLumi info, we need to load it from couch
        if collectionName:
            try:
                from WMCore.ACDC.DataCollectionService import DataCollectionService
                couchURL = kwargs.get('couchURL')
                couchDB = kwargs.get('couchDB')
                filesetName = kwargs.get('filesetName')
                collectionName = kwargs.get('collectionName')
                logging.info('Loading ACDC info for collectionName: %s, with filesetName: %s', collectionName,
                             filesetName)
                dcs = DataCollectionService(couchURL, couchDB)
                acdcFileList = dcs.getProductionACDCInfo(collectionName, filesetName)
            except Exception as ex:
                msg = "Exception while trying to load goodRunList\n"
                msg += "Refusing to create any jobs.\n"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logging.error(msg)
                return

        totalJobs = 0

        locationDict = self.sortByLocation()
        for location in locationDict:
            self.newGroup()
            fileList = locationDict[location]
            getRunLumiInformation = False
            for f in fileList:
                if f['lfn'].startswith("MCFakeFile"):
                    # At least one MCFakeFile is present, so we need run information
                    getRunLumiInformation = True
                    break
            if getRunLumiInformation:
                if self.package == 'WMCore.WMBS':
                    loadRunLumi = self.daoFactory(classname="Files.GetBulkRunLumi")
                    fileLumis = loadRunLumi.execute(files=fileList)
                    if not fileLumis:
                        logging.warning("Empty fileLumis dict for workflow %s, subs %s.",
                                        self.subscription.workflowName(), self.subscription['id'])
                    for f in fileList:
                        lumiDict = fileLumis.get(f['id'], {})
                        for run in lumiDict:
                            f.addRun(run=Run(run, *lumiDict[run]))

            for f in fileList:
                currentEvent = f['first_event']
                eventsInFile = f['events']
                runs = list(f['runs'])
                # We got the runs, clean the file.
                f['runs'] = set()

                if getParents:
                    parentLFNs = self.findParent(lfn=f['lfn'])
                    for lfn in parentLFNs:
                        parent = File(lfn=lfn)
                        f['parents'].add(parent)

                if acdcFileList:
                    totalJobs = self.createACDCJobs(f, acdcFileList, timePerEvent,
                                                    sizePerEvent, memoryRequirement,
                                                    lheInput, eventsPerJob, eventsPerLumi,
                                                    deterministicPileup, totalJobs)
                    continue
                if not f['lfn'].startswith("MCFakeFile"):
                    # there might be files with 0 events that still have to be processed
                    if eventsInFile == 0:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        # Do not set LastEvent
                        self.currentJob["mask"].setMaxAndSkipEvents(None, currentEvent)
                        self.currentJob.addResourceEstimates(jobTime=0,
                                                             memory=memoryRequirement,
                                                             disk=0)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)
                        totalJobs += 1
                        logging.info("Job created for 0-event input file with %s", self.currentJob)
                    # Very uncommon in production, but this job has a real input dataset
                    while eventsInFile:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        if eventsInFile >= eventsPerJob:
                            jobTime = eventsPerJob * timePerEvent
                            diskRequired = eventsPerJob * sizePerEvent
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob - 1, currentEvent)
                        else:
                            jobTime = eventsInFile * timePerEvent
                            diskRequired = eventsInFile * sizePerEvent
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile - 1, currentEvent)
                            eventsInFile = eventsPerJob
                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)

                        eventsInFile -= eventsPerJob
                        currentEvent += eventsPerJob
                        totalJobs += 1
                        logging.debug("Job created for real input with %s", self.currentJob)
                else:
                    # This assumes there's only one run, which is the case for MC
                    lumis = runs[0].lumis
                    (firstLumi, lastLumi) = (min(lumis), max(lumis))
                    currentLumi = firstLumi
                    lumisPerJob = int(ceil(float(eventsPerJob) / eventsPerLumi))

                    while eventsInFile:
                        self.newJob(name=self.getJobName(length=totalJobs))
                        self.currentJob.addFile(f)
                        self.currentJob.addBaggageParameter("lheInputFiles", lheInput)

                        # Limit the number of events to an unsigned 32-bit int
                        if (currentEvent + eventsPerJob - 1) > (2 ** 32 - 1) and \
                                        (currentEvent + eventsInFile) > (2 ** 32 - 1):
                            currentEvent = 1

                        if eventsInFile >= eventsPerJob:
                            jobTime = eventsPerJob * timePerEvent
                            diskRequired = eventsPerJob * sizePerEvent
                            # Alan on 16/Apr/2019: inclusiveMask must be a real inclusiveMask, thus
                            # FirstEvent/FirstLumi and LastEvent/LastLumi are also processed by the job
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsPerJob - 1, currentEvent)
                            self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob - 1, currentLumi)
                        else:
                            jobTime = eventsInFile * timePerEvent
                            diskRequired = eventsInFile * sizePerEvent
                            lumisPerJob = int(ceil(float(eventsInFile) / eventsPerLumi))
                            self.currentJob["mask"].setMaxAndSkipEvents(eventsInFile - 1, currentEvent)
                            self.currentJob["mask"].setMaxAndSkipLumis(lumisPerJob - 1, currentLumi)
                            eventsInFile = eventsPerJob

                        self.currentJob.addResourceEstimates(jobTime=jobTime,
                                                             memory=memoryRequirement,
                                                             disk=diskRequired)
                        if deterministicPileup:
                            self.currentJob.addBaggageParameter("skipPileupEvents", (self.nJobs - 1) * eventsPerJob)

                        eventsInFile -= eventsPerJob
                        currentEvent += eventsPerJob
                        currentLumi += lumisPerJob
                        totalJobs += 1
                        logging.info("Job created with mask: %s", self.currentJob['mask'])

        return
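
In this revision the masks are inclusive (note the eventsPerJob - 1 and
lumisPerJob - 1 above), so FirstEvent/LastEvent and FirstLumi/LastLumi bound
exactly what each job processes. A minimal sketch of the boundary arithmetic
for the MCFakeFile branch, with illustrative names:

from math import ceil

def mcJobBoundaries(eventsInFile, eventsPerJob, eventsPerLumi,
                    firstEvent=1, firstLumi=1):
    # Inclusive (firstEvent, lastEvent, firstLumi, lastLumi) windows,
    # one tuple per job, mirroring the loop above (32-bit wrap-around
    # handling omitted).
    jobs = []
    currentEvent, currentLumi = firstEvent, firstLumi
    while eventsInFile > 0:
        nEvents = min(eventsPerJob, eventsInFile)
        nLumis = int(ceil(float(nEvents) / eventsPerLumi))
        jobs.append((currentEvent, currentEvent + nEvents - 1,
                     currentLumi, currentLumi + nLumis - 1))
        eventsInFile -= nEvents
        currentEvent += nEvents
        currentLumi += nLumis
    return jobs

# 250 events at 100 events/job and 50 events/lumi:
# [(1, 100, 1, 2), (101, 200, 3, 4), (201, 250, 5, 5)]
print(mcJobBoundaries(250, 100, 50))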
Exemplo n.º 58
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)
        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [
            job for job in jobList
            if job['type'] not in ['LogCollect', 'Cleanup']
        ]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed",
                     len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'],
                                                 self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(
                cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs,
                                       'retrydone',
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs,
                                       '%scooloff' % state,
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        if self.reqAuxDB:
            self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName).get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error(
                    "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.",
                    job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                    job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location differs from the one recorded in WMBS
                # WARNING: this only updates the job location in CouchDB, not in WMBS.
                # If the location in WMBS needs to be updated, it should happen in the JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i",
                                  job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (
                        job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if any(x in self.exitCodesNoRetry for x in report.getExitCodes()):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    exhaustJobs.append(job)
                    continue

                if any(x in self.passCodes for x in report.getExitCodes()):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning(
                    "Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs
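
    # How the triage triplet is meant to be consumed (a sketch with assumed
    # variable names; the actual call site lives in this component's retry
    # logic, which is not shown in this excerpt):
    #
    #     cooloff, passed, exhausted = self.readFWJRForErrors(failedJobs)
    #     # cooloff   -> retried after the usual cooloff period
    #     # passed    -> retried immediately, skipping cooloff
    #     # exhausted -> no further retries; typically handed to handleACDC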

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        Exhaust, within a transaction, jobs that have used all their retries.
        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        Process, within a transaction, the retry logic for a batch of jobs
        that failed in the given state.
        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList),
                         state)
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return
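
    # The while loops above are a plain slice-based batcher. The same idiom
    # as a generic sketch (assumed helper, not part of this class):
    #
    #     def chunks(seq, size):
    #         for i in range(0, len(seq), size):
    #             yield seq[i:i + size]
    #
    #     for tmpList in chunks(idList, self.maxProcessSize):
    #         self.handleFailedJobs(self.loadJobsFromList(tmpList), state)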

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs
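
    # Bind lists follow the DAO convention used throughout this component:
    # one dict per row key. For example (hypothetical IDs):
    #
    #     binds = [{"jobid": 42}, {"jobid": 43}]
    #     results = self.idLoad.execute(jobID=binds)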

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Run the handleErrors method, looking for each type of failure
        and dealing with it as desired.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
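
For reference, a minimal sketch of driving one polling cycle of the component
above (the class name and import path are assumptions about the enclosing
module, which this excerpt does not show):

    # from WMComponent.ErrorHandler.ErrorHandlerPoller import ErrorHandlerPoller  # assumed path
    # poller = ErrorHandlerPoller(config)  # same config object as in __init__
    # poller.algorithm()                   # one cycle: handleErrors(), rolled back on failure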
Exemplo n.º 59
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant
    Run through ProcessPool
    """
    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package = "WMComponent.DBS3Buffer",
                                        logger = myThread.logger,
                                        dbinterface = myThread.dbi)

        self.getOutputMapAction      = self.daofactory(classname = "Jobs.GetOutputMap")
        self.bulkAddToFilesetAction  = self.daofactory(classname = "Fileset.BulkAddByLFN")
        self.bulkParentageAction     = self.daofactory(classname = "Files.AddBulkParentage")
        self.getJobTypeAction        = self.daofactory(classname = "Jobs.GetType")
        self.getParentInfoAction     = self.daofactory(classname = "Files.GetParentInfo")
        self.setParentageByJob       = self.daofactory(classname = "Files.SetParentageByJob")
        self.setParentageByMergeJob  = self.daofactory(classname = "Files.SetParentageByMergeJob")
        self.setFileRunLumi          = self.daofactory(classname = "Files.AddRunLumi")
        self.setFileLocation         = self.daofactory(classname = "Files.SetLocationByLFN")
        self.setFileAddChecksum      = self.daofactory(classname = "Files.AddChecksumByLFN")
        self.addFileAction           = self.daofactory(classname = "Files.Add")
        self.jobCompleteInput        = self.daofactory(classname = "Jobs.CompleteInput")
        self.setBulkOutcome          = self.daofactory(classname = "Jobs.SetOutcomeBulk")
        self.getWorkflowSpec         = self.daofactory(classname = "Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID          = self.daofactory(classname = "Jobs.LoadFromID")
        self.getFullJobInfo          = self.daofactory(classname = "Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction    = self.daofactory(classname = "Jobs.GetFWJRTaskName")

        self.dbsStatusAction       = self.dbsDaoFactory(classname = "DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname = "DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction     = self.dbsDaoFactory(classname = "DBSBufferFiles.GetChildren")
        self.dbsCreateFiles        = self.dbsDaoFactory(classname = "DBSBufferFiles.Add")
        self.dbsSetLocation        = self.dbsDaoFactory(classname = "DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation     = self.dbsDaoFactory(classname = "DBSBufferFiles.AddLocation")
        self.dbsSetChecksum        = self.dbsDaoFactory(classname = "DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi         = self.dbsDaoFactory(classname = "DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow        = self.dbsDaoFactory(classname = "ListWorkflow")

        self.dbsLFNHeritage      = self.dbsDaoFactory(classname = "DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # ACDC service
        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb  = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL, appName="WMStatsAgent")

        # Hold data for later commital
        self.dbsFilesToCreate  = []
        self.wmbsFilesToBuild  = []
        self.wmbsMergeFilesToBuild  = []
        self.fileLocation      = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave  = []
        self.listOfJobsToFail  = []
        self.filesetAssoc      = []
        self.parentageBinds    = []
        self.parentageBindsForMerge    = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID     = collections.deque(maxlen = 1000)
        self.datasetAlgoPaths  = collections.deque(maxlen = 1000)
        self.dbsLocations      = set()
        self.workflowIDs       = collections.deque(maxlen = 1000)
        self.workflowPaths     = collections.deque(maxlen = 1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()


        return
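
    # The deque(maxlen=1000) members above act as small bounded caches: once
    # full, an append silently evicts the oldest entry. A quick sketch:
    #
    #     cache = collections.deque(maxlen=2)
    #     cache.append('a'); cache.append('b'); cache.append('c')
    #     list(cache)  # ['b', 'c'] -- 'a' was evicted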

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate  = []
        self.wmbsFilesToBuild  = []
        self.wmbsMergeFilesToBuild  = []
        self.fileLocation      = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave  = []
        self.listOfJobsToFail  = []
        self.filesetAssoc      = []
        self.parentageBinds    = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing
        the framework job report, return a substitute error report built by
        createMissingFWKJR instead.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://","")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg =  "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

        if len(jobReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99997, 'jobReport with no steps: %s ' % jobReportPath)

        return jobReport
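
    # The 9999x codes above are this worker's own convention for synthesised
    # error reports (see createMissingFWKJR further down):
    #
    #     99999 -> FWJR path empty or file not found
    #     99998 -> FWJR file exists but has size 0
    #     99997 -> FWJR unloadable, or loads with no steps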

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to
        recover it getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning("Trying to recover a corrupted FWJR for a %s job with job id %s" % (jobStatus,
                                                                                                jobReport.getJobID()))
            jobInfo = self.getJobTaskNameAction.execute(jobId = jobReport.getJobID(),
                                                        conn = self.getDBConn(),
                                                        transaction = self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % (jobStatus, 
                                                                                                   jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info("TaskName '%s' successfully recovered and added to fwjr id %s." % (jobReport.getTaskName(),
                                                                                                jobReport.getJobID()))

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the job
        ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s" % job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job)
            fwkJobReport.setJobID(job['id'])
            
            jobSuccess = self.handleJob(jobID = job["id"],
                                        fwkJobReport = fwkJobReport)

            if self.returnJobReport:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess,
                                   'jobReport': fwkJobReport})
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)
        
        # handle merge files separately, since their parentage needs to be set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge)
        
        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if len(self.filesetAssoc) > 0:
            self.bulkAddToFilesetAction.execute(binds = self.filesetAssoc,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())

        # Move successful jobs to successful
        if len(self.listOfJobsToSave) > 0:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds = outcomeBinds,
                                    conn = self.getDBConn(),
                                    transaction = self.existingTransaction())

            self.jobCompleteInput.execute(id = idList,
                                          lfnsToSkip = self.jobsWithSkippedFiles,
                                          conn = self.getDBConn(),
                                          transaction = self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete")

        # If we have failed jobs, fail them
        if len(self.listOfJobsToFail) > 0:
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']} for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds = outcomeBinds,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete")

        # Arrange WMBS parentage
        if len(self.parentageBinds) > 0:
            self.setParentageByJob.execute(binds = self.parentageBinds,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        if len(self.parentageBindsForMerge) > 0:
            self.setParentageByMergeJob.execute(binds = self.parentageBindsForMerge,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())

        # Straighten out DBS Parentage
        if len(self.mergedOutputFiles) > 0:
            self.handleDBSBufferParentage()

        if len(self.jobsWithSkippedFiles) > 0:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction = False)

        return returnList
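
    # The per-batch lifecycle of __call__, summarised as a reading aid:
    #
    #   1. reset()                       -- clear the per-batch accumulators
    #   2. handleJob() per FWJR          -- classify jobs, stage WMBS/DBS binds
    #   3. beginTransaction()            -- one DB transaction for the batch
    #   4. bulk inserts + state changes  -- files, filesets, outcomes, parentage
    #   5. commitTransaction()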

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note that
        this will not return the JobGroup output fileset as all jobs will have
        their output placed there.
        """
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if not merged and outputFileset["output_fileset"] is not None:
                outputFilesets.append(outputFileset["output_fileset"])
            elif outputFileset["merged_output_fileset"] is not None:
                outputFilesets.append(outputFileset["merged_output_fileset"])

        return outputFilesets
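
    # Shape of outputMap as consumed above (inferred from this loop; the
    # exact keys come from the Jobs.GetOutputMap DAO):
    #
    #     {'outputModuleLabel': [{'output_fileset': 7,
    #                             'merged_output_fileset': 8}, ...], ...}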

    def addFileToDBS(self, jobReportFile, task):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn = jobReportFile["lfn"],
                                size = jobReportFile["size"],
                                events = jobReportFile["events"],
                                checksums = jobReportFile["checksums"],
                                status = "NOTUPLOADED")
        dbsFile.setAlgorithm(appName = datasetInfo["applicationName"],
                             appVer = datasetInfo["applicationVersion"],
                             appFam = jobReportFile["module_label"],
                             psetHash = "GIBBERISH",
                             configContent = jobReportFile.get('configURL'))

        dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"],
                                              datasetInfo["processedDataset"],
                                              datasetInfo["dataTier"]))
        dbsFile.setValidStatus(validStatus = jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver = jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(era = jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag = jobReportFile.get('globalTag', None))
        #TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id = jobReportFile.get('prep_id', None))
        dbsFile['task'] = task

        for run in jobReportFile["runs"]:
            newRun = Run(runNumber = run.run)
            newRun.extend(run.lumis)
            dbsFile.addRun(newRun)


        dbsFile.setLocation(pnn = list(jobReportFile["locations"])[0], immediateSave = False)
        self.dbsFilesToCreate.append(dbsFile)
        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parent of the file in DBS
        This is meant to be called recursively
        """
        parentsInfo = self.getParentInfoAction.execute([lfn],
                                                       conn = self.getDBConn(),
                                                       transaction = self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] is None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether or
            # not this file has any redneck children and update their parentage
            # information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # And we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn = parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents
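
    # Recursion sketch: climb the parentage tree until a merged ancestor is
    # found, skipping unmerged intermediates.
    #
    #     child lfn -> parent merged?       yes -> use parent lfn
    #               -> grandparent merged?  yes -> use gplfn
    #               -> otherwise            recurse on gplfn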

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID = None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] == None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(file = fwjrFile, jobID = jobID)
        
        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)
            
        if fwjrFile["merged"]:
            self.addFileToDBS(fwjrFile, task)

        return wmbsFile


    def _mapLocation(self, fwkJobReport):
        for file in fwkJobReport.getAllFileRefs():
            if file and hasattr(file, 'location'):
                file.location = self.phedex.getBestNodeName(file.location, self.locLists)


    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean

        """
        jobSuccess = fwkJobReport.taskSuccessful()

        outputMap = self.getOutputMapAction.execute(jobID = jobID,
                                                    conn = self.getDBConn(),
                                                    transaction = self.existingTransaction())

        jobType = self.getJobTypeAction.execute(jobID = jobID,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            outputModules = set()
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'])
            if set(outputMap.keys()) == outputModules:
                pass
            elif jobType == "LogCollect" and len(outputMap.keys()) == 0 and outputModules == set(['LogCollect']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['Merged', 'logArchive']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and outputModules == set(['MergedError', 'logArchive']):
                pass
            elif jobType == "Express" and set(outputMap.keys()).difference(outputModules) == set(['write_RAW']):
                pass
            else:
                failJob = True
                if jobType in [ "Processing", "Production" ]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error("Job %d , list of expected outputModules does not match job report, failing job", jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1')
                else:
                    logging.debug("Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID)
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step = 'logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.error("Job %d , bad jobReport, failing job",  jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if len(fileList) > 0:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs, not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(fwkJobReport, fwjrFile["lfn"], fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error("Error occurred: associating log collect location, will try again\n %s" % str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:

            wmbsJob = Job(id = jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:

                logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"],
                                              jobID = jobID, task = fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputID})

                # LogCollect jobs have no output fileset
                if jobType != "LogCollect":
                    outputFilesets = self.outputFilesetsForJob(outputMap, merged, moduleLabel)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputFileset})

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            # Only save once job is done, and we're sure we made it through okay
            self._mapLocation(wmbsJob['fwjr'])
            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess
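
    # Special cases accepted by the output-module consistency check above:
    #
    #     LogCollect : empty outputMap, FWJR reports only 'LogCollect'
    #     Merge      : map has Merged/MergedError/logArchive; FWJR has one of
    #                  the two merge outputs plus logArchive
    #     Express    : outputMap may additionally contain 'write_RAW'
    #     Proc/Prod  : a mismatch is tolerated for multi-cmsRun-step jobs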
    
    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logArchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        inputFileList = fwkJobReport.getAllInputFiles()
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView("FWJRDump", 'jobsByOutputLFN', 
                                               options = {"stale": "update_after"},
                                               keys = keys)['rows']
        if len(resultRows) > 0:
            #get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            #update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []
            
            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])
            
            self.localWMStats.updateLogArchiveLFN(parentJobNames, logArchiveLFN)
        else:
            # TODO: once CouchDB is consistent with the relational DB, this
            # branch (the resultRows > 0 check) should be removed; for now the
            # job needs to be failed and retried.
            logging.error("Job report is missing for updating the log archive mapping.\n Input file list:\n %s" % inputFileList)

        return

    def createMissingFWKJR(self, parameters, errorCode = 999,
                           errorDescription = 'Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_

        Does the actual job of creating files in DBSBuffer.
        WARNING: this assumes all files in a job have the same final location.
        """
        if len(self.dbsFilesToCreate) == 0:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc    = []
        dbsCksumBinds = []
        runLumiBinds  = []
        selfChecksums = None
        jobLocations  = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID         = None
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (dbsFile['datasetPath'],
                                                           dbsFile["appName"],
                                                           dbsFile["appVer"],
                                                           dbsFile["appFam"],
                                                           dbsFile["psetHash"],
                                                           dbsFile['processingVer'],
                                                           dbsFile['acquisitionEra'],
                                                           dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({'datasetAlgoPath': datasetAlgoPath,
                                               'assocID': assocID})
                except WMException:
                    raise
                except Exception as ex:
                    msg =  "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(workflowName, taskPath, conn = self.getDBConn(),
                                                     transaction = self.existingTransaction())
                workflowID = result['id']

            self.workflowPaths.append(workflowPath)
            self.workflowIDs.append({'workflowPath': workflowPath, 'workflowID': workflowID})

            lfn           = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation   = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append((lfn, dbsFile['size'],
                                  dbsFile['events'], assocID,
                                  dbsFile['status'], workflowID))

            dbsFileLoc.append({'lfn': lfn, 'sename' : jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry],
                                          'cktype' : entry})

        try:
            diffLocation = jobLocations.difference(self.dbsLocations)

            for jobLocation in diffLocation:
                self.dbsInsertLocation.execute(siteName = jobLocation,
                                               conn = self.getDBConn(),
                                               transaction = self.existingTransaction())
                self.dbsLocations.add(jobLocation)

            self.dbsCreateFiles.execute(files = dbsFileTuples,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            self.dbsSetLocation.execute(binds = dbsFileLoc,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList = dbsCksumBinds,
                                        conn = self.getDBConn(),
                                        transaction = self.existingTransaction())

            if len(runLumiBinds) > 0:
                self.dbsSetRunLumi.execute(file = runLumiBinds,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg =  "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s\n" % jobLocation)
            logging.debug("dbsFiles: %s\n" % dbsFileTuples)
            logging.debug("dbsFileLoc: %s\n" %dbsFileLoc)
            logging.debug("Checksum binds: %s\n" % dbsCksumBinds)
            logging.debug("RunLumi binds: %s\n" % runLumiBinds)
            raise AccountantWorkerException(msg)


        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return
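
    # The dataset/algo cache key built above is a colon-joined composite of
    # everything identifying the pairing, e.g. (hypothetical values):
    #
    #     '/Prim/Proc-v1/TIER:cmsRun:CMSSW_1_2_3:RECO:GIBBERISH:v1:EraA:GT_v1'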


    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do what can be done in bulk, in bulk
        """
        if len(wmbsFilesToBuild) == 0:
            # Nothing to do
            return

        runLumiBinds   = []
        fileCksumBinds = []
        fileLocations  = []
        fileCreate     = []

        for wmbsFile in wmbsFilesToBuild:
            lfn           = wmbsFile['lfn']
            if lfn is None:
                continue

            selfChecksums = wmbsFile['checksums']
            # Add to a different parentage relation depending on the job type:
            # merge jobs don't get parentage on failed input files, while
            # all other job types set parentage for every input file.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})
                
            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if len(wmbsFile.getLocations()) > 0:
                fileLocations.append({'lfn': lfn, 'location': wmbsFile.getLocations()[0]})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry],
                                           'cktype' : entry})

            fileCreate.append([lfn,
                               wmbsFile['size'],
                               wmbsFile['events'],
                               None,
                               wmbsFile["first_event"],
                               wmbsFile['merged']])

        if len(fileCreate) == 0:
            return

        try:
            self.addFileAction.execute(files = fileCreate,
                                       conn = self.getDBConn(),
                                       transaction = self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(file = runLumiBinds,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())

            self.setFileAddChecksum.execute(bulkList = fileCksumBinds,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())

            self.setFileLocation.execute(lfn = fileLocations,
                                         location = self.fileLocation,
                                         conn = self.getDBConn(),
                                         transaction = self.existingTransaction())


        except WMException:
            raise
        except Exception as ex:
            msg =  "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds: \n")
            logging.debug("FileCreate binds: %s\n" % fileCreate)
            logging.debug("Runlumi binds: %s\n" % runLumiBinds)
            logging.debug("Checksum binds: %s\n" % fileCksumBinds)
            logging.debug("FileLocation binds: %s\n" % fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files in place, so the caller's list is emptied too
        wmbsFilesToBuild[:] = []
        return
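
    # Shapes of the bulk binds assembled above (one dict per row, matching
    # the DAO convention; 'adler32' is just an example checksum type):
    #
    #     parentageBinds : {'child': lfn, 'jobid': jid}
    #     runLumiBinds   : {'lfn': lfn, 'runs': [Run, ...]}
    #     fileLocations  : {'lfn': lfn, 'location': pnn}
    #     fileCksumBinds : {'lfn': lfn, 'cksum': value, 'cktype': 'adler32'}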

    def createFileFromDataStructsFile(self, file, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(file)

        if isinstance(file["locations"], set):
            pnn = list(file["locations"])[0]
        elif isinstance(file["locations"], list):
            if len(file['locations']) > 1:
                logging.error("Have more then one location for a file in job %i" % (jobID))
                logging.error("Choosing location %s" % (file['locations'][0]))
            pnn = file["locations"][0]
        else:
            pnn = file["locations"]

        wmbsFile["locations"] = set()

        if pnn is not None:
            wmbsFile.setLocation(pnn = pnn, immediateSave = False)
        wmbsFile['jid'] = jobID
        
        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList         = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn = lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info(len(bindList))

        if len(bindList) > 0:
            try:
                self.dbsLFNHeritage.execute(binds = bindList,
                                            conn = self.getDBConn(),
                                            transaction = self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg =  "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.
        The way skipped files are handled imposes an important restriction:
        each skipped file must have been processed by a single job in the
        task, with no job mask applied. This is suitable for jobs using the
        ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms.
        Here, ACDC records are created and the files are moved from
        completed to wmbs_sub_files_failed.
        """
        jobList = self.getFullJobInfo.execute([{'jobid' : x} for x in self.jobsWithSkippedFiles.keys()],
                                              fileSelection = self.jobsWithSkippedFiles,
                                              conn = self.getDBConn(),
                                              transaction = self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask = False)
        return
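
    # fileSelection narrows each job's file list to just its skipped LFNs, so
    # the ACDC record covers only what needs reprocessing. The mapping handed
    # to the DAO looks like (hypothetical values):
    #
    #     self.jobsWithSkippedFiles == {1234: ['/store/data/.../skipped.root']}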