def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states and perform
    proper actions with the jobs, according to the state
    """
    timeNow = int(time.time())
    state2ExitCode = {"Aborted": 71301,
                      "Draining": 71302,
                      "Down": 71303}
    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')

    if jobInfo:
        bossAir = BossAirAPI(self.config)
        jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in state2ExitCode)

        ercode = state2ExitCode.get(state, 71300)
        bossAir.kill(jobtokill, errorCode=ercode)

    # only now that jobs were updated by the plugin, we flip the site state
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state, stateTime=timeNow,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())

    return
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())

    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')
    if not jobInfo:
        # then no jobs to look at
        return

    bossAir = BossAirAPI(self.config, noSetup=True)
    jobtokill = bossAir.updateSiteInformation(jobInfo, siteName,
                                              state in ("Aborted", "Draining", "Down"))
    if state == "Aborted":
        ercode = 71301
    elif state == "Draining":
        ercode = 71302
    elif state == "Down":
        ercode = 71303
    else:
        ercode = 71300
    bossAir.kill(jobtokill, errorCode=ercode)

    return
def testG_monitoringDAO(self):
    """
    _monitoringDAO_

    Because I need a test for the monitoring DAO
    """
    myThread = threading.currentThread()

    config = self.getConfig()

    baAPI = BossAirAPI(config=config, insertStates=True)

    # Create some jobs
    nJobs = 10

    jobDummies = self.createDummyJobs(nJobs=nJobs)

    # Prior to building the job, each job must have a plugin
    # and user assigned
    for job in jobDummies:
        job['plugin'] = 'TestPlugin'
        job['owner'] = 'tapas'
        job['location'] = 'T2_US_UCSD'
        job.save()

    baAPI.submit(jobs=jobDummies)

    results = baAPI.monitor()

    self.assertEqual(results[0]['Pending'], nJobs)

    return
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states and perform
    proper actions with the jobs, according to the state
    """
    state2ExitCode = {"Aborted": 71301,
                      "Draining": 71302,
                      "Down": 71303}
    executingJobs = self.wmbsDAOFactory(classname="Jobs.ListByState")
    jobInfo = executingJobs.execute(state='executing')

    if jobInfo:
        bossAir = BossAirAPI(self.config, noSetup=True)
        jobtokill = bossAir.updateSiteInformation(jobInfo, siteName, state in state2ExitCode)

        ercode = state2ExitCode.get(state, 71300)
        bossAir.kill(jobtokill, errorCode=ercode)

    # only now that jobs were updated by the plugin, we flip the site state
    setStateAction = self.wmbsDAOFactory(classname="Locations.SetState")
    setStateAction.execute(siteName=siteName, state=state,
                           conn=self.getDBConn(),
                           transaction=self.existingTransaction())

    return
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
    setStateAction.execute(siteName = siteName, state = state,
                           conn = self.getDBConn(),
                           transaction = self.existingTransaction())

    executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByState")
    jobInfo = executingJobs.execute(state = 'executing')
    if not jobInfo:
        # then no jobs to look at
        return

    bossAir = BossAirAPI(self.config, noSetup = True)
    jobtokill = bossAir.updateSiteInformation(jobInfo, siteName,
                                              state in ("Aborted", "Draining", "Down"))
    if state == "Aborted":
        ercode = 71301
    elif state == "Draining":
        ercode = 71302
    elif state == "Down":
        ercode = 71303
    else:
        ercode = 71300
    bossAir.kill(jobtokill, errorCode = ercode)

    return
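The changeSiteState variants above differ mainly in how the kill error code is picked: the older revisions walk an if/elif chain over the site state, while the newer ones use a dictionary lookup with a default of 71300. A minimal standalone sketch of that mapping (the helper name exitCodeForSiteState is illustrative only, not part of WMCore):

# Sketch, not WMCore code: the dict-based exit-code lookup used by the newer
# changeSiteState revisions, equivalent to the if/elif chain above.
STATE_TO_EXIT_CODE = {"Aborted": 71301, "Draining": 71302, "Down": 71303}

def exitCodeForSiteState(state, default=71300):
    """Return the BossAir kill error code used when a site enters the given state."""
    return STATE_TO_EXIT_CODE.get(state, default)

for siteState in ("Aborted", "Draining", "Down", "Normal"):
    print(siteState, exitCodeForSiteState(siteState))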
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn,
                            transaction=True)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn,
                                      transaction=True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []

        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id=liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)

        # Now kill them
        try:
            bossAir.kill(jobs=killableJobs)
        except BossAirException as ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig = None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This will
    mark all incomplete jobs as failed and files that belong to all
    non-cleanup and non-logcollect subscriptions as failed.  The name of the
    JSM couch database and the URL to the database must be passed in as well
    so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package = "WMCore.WMBS",
                            logger = myThread.logger,
                            dbinterface = myThread.dbi)
    killFilesAction = daoFactory(classname = "Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname = "Jobs.KillWorkflow")

    existingTransaction = False
    if myThread.transaction.conn:
        existingTransaction = True
    else:
        myThread.transaction.begin()

    killFilesAction.execute(workflowName = workflowName,
                            conn = myThread.transaction.conn,
                            transaction = True)

    liveJobs = killJobsAction.execute(workflowName = workflowName,
                                      conn = myThread.transaction.conn,
                                      transaction = True)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system
    # only works if we can start the API
    if bossAirConfig:
        bossAir = BossAirAPI(config = bossAirConfig, noSetup = True)
        killableJobs = []

        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system
                liveWMBSJob = Job(id = liveJob["id"])
                liveWMBSJob.update(liveJob)
                changeState.propagate(liveWMBSJob, "killed", liveJob["state"])
                killableJobs.append(liveJob)

        # Now kill them
        try:
            bossAir.kill(jobs = killableJobs)
        except BossAirException as ex:
            # Something's gone wrong
            # Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master
            # the batch system will have to take care of itself.
            pass
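Both killWorkflow variants open a database transaction only when the calling thread does not already have one, so the function works standalone or as part of a larger transactional operation. A self-contained sketch of that guard pattern, assuming nothing about the real WMCore transaction object (FakeTransaction below is a stand-in for illustration only):

# Illustration only: the "reuse an existing transaction if present" guard
# used at the top of killWorkflow, with a stand-in transaction object.
class FakeTransaction(object):
    """Stand-in exposing the two things the guard relies on: conn and begin()."""
    def __init__(self):
        self.conn = None

    def begin(self):
        self.conn = "open-connection"

def ensureTransaction(transaction):
    """Return True if a transaction was already open, otherwise start one."""
    if transaction.conn:
        return True
    transaction.begin()
    return False

txn = FakeTransaction()
print(ensureTransaction(txn))  # False: no connection yet, so begin() is called
print(ensureTransaction(txn))  # True: the existing connection is reused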
def __init__(self, config):
    BaseWorkerThread.__init__(self)
    myThread = threading.currentThread()

    #DAO factory for WMBS objects
    self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                 logger = logging,
                                 dbinterface = myThread.dbi)

    self.config = config

    #Libraries
    self.resourceControl = ResourceControl()
    self.changeState = ChangeState(self.config)
    self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)

    # BossAir
    self.bossAir = BossAirAPI(config=self.config)

    # Additions for caching-based JobSubmitter
    self.workflowTimestamps = {}
    self.cachedJobIDs = set()
    self.cachedJobs = {}
    self.jobDataCache = {}
    self.jobsToPackage = {}
    self.sandboxPackage = {}
    self.siteKeys = {}
    self.locationDict = {}
    self.cmsNames = {}
    self.drainSites = []
    self.sortedSites = []
    self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
    self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                            self.packageSize * 1000)

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobSubmitter")

    try:
        if not getattr(self.config.JobSubmitter, 'submitDir', None):
            self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
        self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')
        if not os.path.exists(self.packageDir):
            os.makedirs(self.packageDir)
    except Exception as ex:
        msg = "Error while trying to create packageDir %s\n!"
        msg += str(ex)
        logging.error(msg)
        self.sendAlert(6, msg=msg)
        try:
            logging.debug("PackageDir: %s" % self.packageDir)
            logging.debug("Config: %s" % config)
        except:
            pass
        raise JobSubmitterPollerException(msg)
def changeSiteState(self, siteName, state):
    """
    _changeSiteState_
    Set a site to some of the possible states,
    if the state is Aborted we must do extra actions.
    """
    setStateAction = self.wmbsDAOFactory(classname = "Locations.SetState")
    setStateAction.execute(siteName = siteName, state = state,
                           conn = self.getDBConn(),
                           transaction = self.existingTransaction())

    if state == "Aborted" and self.config:
        # Kill all jobs in the batch system assigned to this site
        executingJobs = self.wmbsDAOFactory(classname = "Jobs.ListByStateAndLocation")
        jobIds = executingJobs.execute(state = 'executing', location = siteName)
        bossAir = BossAirAPI(self.config, noSetup = True)
        bossAir.kill(jobIds, errorCode = 61301)

    return
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")
    self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
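The setFWJRAction DAO created here is later fed binds that map each job id to its expected framework job report path, Report.<retry_count>.pkl inside the job cache directory, as done in JobTrackerPoller.failJobs and passJobs further below. A minimal sketch of how those binds are built (the dict-based job is a stand-in for the WMBS Job object and its getCache() call):

import os

# Illustrative reconstruction of the jrBinds built in passJobs/failJobs.
def buildFwjrBinds(jobs):
    """Build {'jobid', 'fwjrpath'} binds pointing at Report.<retry_count>.pkl."""
    jrBinds = []
    for job in jobs:
        jrPath = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
    return jrBinds

print(buildFwjrBinds([{'id': 7, 'cache_dir': '/tmp/JobCache/job_7', 'retry_count': 0}]))
# [{'jobid': 7, 'fwjrpath': '/tmp/JobCache/job_7/Report.0.pkl'}]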
def testT_updateJobInfo(self): """ _updateJobInfo_ Test the updateSiteInformation method from PyCondorPlugin.py """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'PyCondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 1 nJobs = 2 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site="se.T2_US_UCSD") for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status='Idle') ## # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN # updateSiteInformation() method should edit the classAd for all the jobs # that are bound for the site # Check the Q manually using condor_q -l <job id> # jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True) if jtok != None: baAPI.kill( jtok, errorCode=61301 ) # errorCode can be either 61301/61302/61303 (Aborted/Draining/Down) return
def testKillWorkflow(self):
    """
    _testKillWorkflow_

    Verify that workflow killing works correctly.
    """
    baAPI = BossAirAPI(config=self.config, insertStates=True)

    # Create nine jobs
    self.setupForKillTest(baAPI=baAPI)
    self.assertEqual(len(baAPI._listRunJobs()), 9)
    killWorkflow("Main", self.config, self.config)

    self.verifyFileKillStatus()
    self.verifyJobKillStatus()
    self.assertEqual(len(baAPI._listRunJobs()), 8)

    return
def __init__(self, config):
    """
    __init__

    Set up the caching and other objects
    """
    self.config = config
    BaseWorkerThread.__init__(self)

    self.cachedJobs = []

    self.bossAir = BossAirAPI(config=config)

    # With no timeouts, nothing ever happens
    # Otherwise we expect a dictionary with the keys representing
    # the states and the values the timeouts.
    self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts')

    return
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site=None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) # Now kill 'em manually command = ['condor_rm', self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() del jobSubmitter return
def testG_monitoringDAO(self): """ _monitoringDAO_ Because I need a test for the monitoring DAO """ return myThread = threading.currentThread() config = self.getConfig() changeState = ChangeState(config) baAPI = BossAirAPI(config = config) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs = nJobs) # Prior to building the job, each job must have a plugin # and user assigned for job in jobDummies: job['plugin'] = 'TestPlugin' job['owner'] = 'mnorman' job['location'] = 'T2_US_UCSD' job.save() baAPI.submit(jobs = jobDummies) results = baAPI.monitor() self.assertEqual(len(results), nJobs) for job in results: self.assertEqual(job['plugin'], 'CondorPlugin') return
def __init__(self, config):
    """
    Initialise class members
    """
    BaseWorkerThread.__init__(self)
    self.config = config

    myThread = threading.currentThread()

    self.changeState = ChangeState(self.config)
    self.bossAir = BossAirAPI(config=config)
    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs")

    # initialize the alert framework (if available)
    self.initAlerts(compName="JobTracker")
def __init__(self, config):
    """
    __init__
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.bossAir = BossAirAPI(config=self.config)
    self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL)
    self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                               self.config.WorkQueueManager.dbname)

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
    self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
    self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'PyCondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site=None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status='Idle') baAPI.kill(jobs=idleJobs) del jobSubmitter return
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config = config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config = config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status = 'Idle') sn = "T2_US_UCSD" # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist # in BossAir_t.py baAPI.updateSiteInformation(idleJobs, sn, True) # Now kill 'em manually # command = ['condor_rm', self.user] # pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False) # pipe.communicate() del jobSubmitter return
def testG_monitoringDAO(self): """ _monitoringDAO_ Because I need a test for the monitoring DAO """ return myThread = threading.currentThread() config = self.getConfig() changeState = ChangeState(config) baAPI = BossAirAPI(config=config) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs=nJobs) # Prior to building the job, each job must have a plugin # and user assigned for job in jobDummies: job['plugin'] = 'TestPlugin' job['owner'] = 'mnorman' job['location'] = 'T2_US_UCSD' job.save() baAPI.submit(jobs=jobDummies) results = baAPI.monitor() self.assertEqual(len(results), nJobs) for job in results: self.assertEqual(job['plugin'], 'CondorPlugin') return
def testKillWorkflow(self):
    """
    _testKillWorkflow_

    Verify that workflow killing works correctly.
    """
    configFile = EmulatorSetup.setupWMAgentConfig()
    config = loadConfigurationFile(configFile)
    baAPI = BossAirAPI(config = config)

    # Create nine jobs
    self.setupForKillTest(baAPI = baAPI)
    self.assertEqual(len(baAPI._listRunJobs()), 9)
    killWorkflow("Main", config, config)

    self.verifyFileKillStatus()
    self.verifyJobKillStatus()
    self.assertEqual(len(baAPI._listRunJobs()), 8)

    EmulatorSetup.deleteConfig(configFile)
    return
def testKillWorkflow(self):
    """
    _testKillWorkflow_

    Verify that workflow killing works correctly.
    """
    configFile = EmulatorSetup.setupWMAgentConfig()
    config = loadConfigurationFile(configFile)
    baAPI = BossAirAPI(config=config)

    # Create nine jobs
    self.setupForKillTest(baAPI=baAPI)
    self.assertEqual(len(baAPI._listRunJobs()), 9)
    killWorkflow("Main", config, config)

    self.verifyFileKillStatus()
    self.verifyJobKillStatus()
    self.assertEqual(len(baAPI._listRunJobs()), 8)

    EmulatorSetup.deleteConfig(configFile)
    return
def testT_updateJobInfo(self): """ _updateJobInfo_ Test the updateSiteInformation method from CondorPlugin.py """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 1 nJobs = 2 dummycacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join(self.testDir, 'workloadTest', workloadName), site="se.T2_US_UCSD") for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status='Idle') ## # Make one of the sites in the sitelist to be True for ABORTED/DRAINING/DOWN # updateSiteInformation() method should edit the classAd for all the jobs # that are bound for the site # Check the Q manually using condor_q -l <job id> # jtok = baAPI.updateSiteInformation(idleJobs, "T2_US_UCSD", True) if jtok != None: baAPI.kill(jtok, errorCode=71301) # errorCode can be either 71301/71302/71303 (Aborted/Draining/Down) return
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'PyCondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config = config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config = config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status = 'Idle') baAPI.kill(jobs = idleJobs) del jobSubmitter return
def __init__(self, config):
    """
    __init__
    """
    BaseWorkerThread.__init__(self)
    self.config = config
    self.bossAir = BossAirAPI(config=self.config)
    self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url)
    self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl,
                               self.config.WorkQueueManager.dbname)

    myThread = threading.currentThread()

    self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                 logger=myThread.logger,
                                 dbinterface=myThread.dbi)

    self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater")
    self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority")
    self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus")
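This poller's synchronizeJobPriority method (shown further below) queries ReqMgr2 at most once per workflow per cycle by caching the priorities it has already fetched, and it only collects workflows whose priority actually changed. A standalone sketch of that cache-and-compare step, with a stubbed lookup in place of the ReqMgr2 call (findPriorityChanges is an illustrative name, not a WMCore function):

# Sketch of the cache-and-compare step in synchronizeJobPriority, with a
# stubbed priority lookup standing in for the ReqMgr2 service call.
def findPriorityChanges(currentPriorities, fetchPriority):
    """
    currentPriorities: dict of workflow name -> priority currently in use
    fetchPriority: callable returning the authoritative priority (may raise)
    Returns a dict of workflows whose priority needs to be updated.
    """
    priorityCache = {}
    toUpdate = {}
    for workflow, priority in currentPriorities.items():
        if workflow not in priorityCache:
            try:
                priorityCache[workflow] = fetchPriority(workflow)
            except Exception as ex:
                # Mirror the poller: log and skip a workflow we cannot resolve
                print("Couldn't retrieve the priority of request %s: %s" % (workflow, ex))
                continue
        if priority != priorityCache[workflow]:
            toUpdate[workflow] = priorityCache[workflow]
    return toUpdate

stubPriorities = {'wf_reco': 85000, 'wf_gen': 63000}
print(findPriorityChanges({'wf_reco': 80000, 'wf_gen': 63000}, stubPriorities.__getitem__))
# {'wf_reco': 85000}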
def testT_updateJobInfo(self):
    """
    _updateJobInfo_

    Test the updateSiteInformation method from CondorPlugin.py
    """
    nRunning = getCondorRunningJobs(self.user)

    config = self.getConfig()
    config.BossAir.pluginName = 'CondorPlugin'

    baAPI = BossAirAPI(config = config)
    baAPI.track()
    idleJobs = baAPI._loadByStatus(status = 'Idle')
    print(idleJobs)
    for job in idleJobs:
        print(job['id'])
    baAPI.updateSiteInformation(idleJobs, info = None)

    return
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except Exception as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) try: logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) except: pass raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} return
def testG_gLiteTest(self): """ _gLiteTest_ This test works on the gLitePlugin, checking all of its functions with a single set of jobs """ config = self.getConfig() config.BossAir.gliteConf = '/afs/cern.ch/cms/LCG/LCG-2/UI/conf/glite_wms_CERN.conf' config.BossAir.credentialDir = '/home/crab/ALL_SETUP/credentials/' config.BossAir.gLiteProcesses = 2 config.BossAir.gLitePrefixEnv = "/lib64/" config.BossAir.pluginNames.append("gLitePlugin") config.BossAir.manualProxyPath = environ['X509_USER_PROXY'] config.Agent.serverDN = "/we/bypass/myproxy/logon" #config.BossAir.pluginNames = ["gLitePlugin"] baAPI = BossAirAPI(config=config) nJobs = 2 jobDummies = self.createDummyJobs(nJobs=nJobs, location='grid-ce-01.ba.infn.it') jobPackage = os.path.join(self.testDir, 'JobPackage.pkl') f = open(jobPackage, 'w') f.write(' ') f.close() sandbox = os.path.join(self.testDir, 'sandbox.box') f = open(sandbox, 'w') f.write(' ') f.close() jobList = [] userdn = executeCommand('grid-cert-info -subject -file %s' % config.BossAir.manualProxyPath) newuser = self.daoFactory(classname="Users.New") newuser.execute(dn=userdn) for j in jobDummies: job = j # {'id': j['id']} job['custom'] = {'location': 'grid-ce-01.ba.infn.it'} job['location'] = 'grid-ce-01.ba.infn.it' job['plugin'] = 'gLitePlugin' job['name'] = j['name'] job['cache_dir'] = self.testDir job['retry_count'] = 0 job['owner'] = userdn job['packageDir'] = self.testDir job['sandbox'] = sandbox job['priority'] = None jobList.append(job) baAPI.submit(jobs=jobList) # Should be new jobs newJobs = baAPI._loadByStatus(status='New') self.assertNotEqual(len(newJobs), nJobs) time.sleep(2) baAPI.track() # Should be not anymore marked as new newJobs = baAPI._loadByStatus('New', 0) self.assertNotEqual(len(newJobs), nJobs) # Killing all the jobs baAPI.kill(jobList) #time.sleep(15) baAPI.track() ## Issues running tests below due to glite delay on marking job as killed # Should be just running jobs #killedJobs = baAPI._loadByStatus('Cancelled by user', 0) #self.assertEqual(len(killedJobs), 0) # Check if they're complete #completeJobs = baAPI.getComplete() #self.assertEqual(len(completeJobs), nJobs) return
def testD_MyProxyDelegation(self): """ _MyProxyDelegation_ Test whether we can delegate a proxy via myproxy to this job IMPORTANT: If you are going to run this test you will have to set the serverCert/Key config options to point to your local server cert. You will also have to run this job with your DN. I don't recommend figuring out how to do this without knowing what you're doing in regards to proxy stuff. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) # Get the config and set the removal time to -10 for testing proxyDir = os.path.join(self.testDir, 'proxyDir') os.mkdir(proxyDir) config = self.getConfig() config.BossAir.removeTime = -10.0 config.BossAir.pluginNames.append('VanillaCondorPlugin') config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem' config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem' config.BossAir.myproxyServer = 'myproxy.cern.ch' config.BossAir.proxyDir = proxyDir config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a' userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632' nJobs = 10 jobDummies = self.createDummyJobs(nJobs = nJobs) baAPI = BossAirAPI(config = config) jobPackage = os.path.join(self.testDir, 'JobPackage.pkl') f = open(jobPackage, 'w') f.write(' ') f.close() sandbox = os.path.join(self.testDir, 'sandbox.box') f = open(sandbox, 'w') f.write(' ') f.close() jobList = [] for j in jobDummies: tmpJob = {'id': j['id']} tmpJob['custom'] = {'location': 'malpaquet'} tmpJob['name'] = j['name'] tmpJob['cache_dir'] = self.testDir tmpJob['retry_count'] = 0 tmpJob['plugin'] = 'VanillaCondorPlugin' tmpJob['owner'] = userDN tmpJob['packageDir'] = self.testDir tmpJob['sandbox'] = sandbox tmpJob['priority'] = None jobList.append(tmpJob) info = {} #info['packageDir'] = self.testDir info['index'] = 0 info['sandbox'] = sandbox baAPI.submit(jobs = jobList, info = info) proxyFile = os.listdir(proxyDir)[0] stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \ % os.path.join(proxyDir, proxyFile)) self.assertEqual(stdout.split('\n')[0], 'subject : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN) # Now kill 'em manually command = ['condor_rm', self.user] SubprocessAlgos.runCommand(cmd = command, shell = False) return
def testF_WMSMode(self): """ _WMSMode_ Try running things in WMS Mode. """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' config.BossAir.submitWMSMode = True baAPI = BossAirAPI(config=config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 dummycacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site=None) for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) baAPI.track() idleJobs = baAPI._loadByStatus(status='Idle') sn = "T2_US_UCSD" # Test the Site Info has been updated. Make Sure T2_US_UCSD is not in the sitelist # in BossAir_t.py baAPI.updateSiteInformation(idleJobs, sn, True) # Now kill 'em manually # command = ['condor_rm', self.user] # pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False) # pipe.communicate() del jobSubmitter return
def testC_CondorTest(self): """ _CondorTest_ This test works on the SimpleCondorPlugin, checking all of its functions with a single set of jobs """ nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) # Get the config and set the removal time to -10 for testing config = self.getConfig() config.BossAir.removeTime = -10.0 nJobs = 10 jobDummies = self.createDummyJobs(nJobs=nJobs) baAPI = BossAirAPI(config=config, insertStates=True) jobPackage = os.path.join(self.testDir, 'JobPackage.pkl') f = open(jobPackage, 'w') f.write(' ') f.close() sandbox = os.path.join(self.testDir, 'sandbox.box') f = open(sandbox, 'w') f.write(' ') f.close() jobList = [] for j in jobDummies: tmpJob = {'id': j['id']} tmpJob['custom'] = {'location': 'malpaquet'} tmpJob['name'] = j['name'] tmpJob['cache_dir'] = self.testDir tmpJob['retry_count'] = 0 tmpJob['plugin'] = 'SimpleCondorPlugin' tmpJob['owner'] = 'tapas' tmpJob['packageDir'] = self.testDir tmpJob['sandbox'] = sandbox tmpJob['priority'] = None tmpJob['usergroup'] = "wheel" tmpJob['userrole'] = 'cmsuser' jobList.append(tmpJob) info = {} # info['packageDir'] = self.testDir info['index'] = 0 info['sandbox'] = sandbox baAPI.submit(jobs=jobList, info=info) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nJobs) baAPI.track() newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), nJobs) # Do a second time to make sure that the cache # doesn't die on us baAPI.track() newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), nJobs) baAPI.kill(jobs=jobList) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0) # Try resubmission for j in jobList: j['retry_count'] = 1 baAPI.submit(jobs=jobList, info=info) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nJobs) # See where they are baAPI.track() newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), nJobs) # Now kill 'em manually command = ['condor_rm', self.user] pipe = Popen(command, stdout=PIPE, stderr=PIPE, shell=False) pipe.communicate() # See what happened baAPI.track() newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), 0) # newJobs = baAPI._loadByStatus(status = 'Removed') # self.assertEqual(len(newJobs), nJobs) # Because removal time is -10.0, jobs should remove immediately baAPI.track() # Assert that jobs were listed as completed myThread = threading.currentThread() newJobs = baAPI._loadByStatus(status='Removed', complete='0') self.assertEqual(len(newJobs), nJobs) return
def testC_CondorTest(self): """ _CondorTest_ This test works on the CondorPlugin, checking all of its functions with a single set of jobs """ nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) # Get the config and set the removal time to -10 for testing config = self.getConfig() config.BossAir.removeTime = -10.0 nJobs = 10 jobDummies = self.createDummyJobs(nJobs = nJobs) baAPI = BossAirAPI(config = config) print self.testDir jobPackage = os.path.join(self.testDir, 'JobPackage.pkl') f = open(jobPackage, 'w') f.write(' ') f.close() sandbox = os.path.join(self.testDir, 'sandbox.box') f = open(sandbox, 'w') f.write(' ') f.close() jobList = [] for j in jobDummies: tmpJob = {'id': j['id']} tmpJob['custom'] = {'location': 'malpaquet'} tmpJob['name'] = j['name'] tmpJob['cache_dir'] = self.testDir tmpJob['retry_count'] = 0 tmpJob['plugin'] = 'CondorPlugin' tmpJob['owner'] = 'tapas' tmpJob['packageDir'] = self.testDir tmpJob['sandbox'] = sandbox tmpJob['priority'] = None tmpJob['usergroup'] = "wheel" tmpJob['userrole'] = 'cmsuser' jobList.append(tmpJob) info = {} #info['packageDir'] = self.testDir info['index'] = 0 info['sandbox'] = sandbox baAPI.submit(jobs = jobList, info = info) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) baAPI.track() newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), nJobs) # Do a second time to make sure that the cache # doesn't die on us baAPI.track() newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), nJobs) baAPI.kill(jobs = jobList) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0) # Try resubmission for j in jobList: j['retry_count'] = 1 baAPI.submit(jobs = jobList, info = info) nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) # See where they are baAPI.track() newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), nJobs) # Now kill 'em manually command = ['condor_rm', self.user] pipe = Popen(command, stdout = PIPE, stderr = PIPE, shell = False) pipe.communicate() # See what happened baAPI.track() newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), 0) #newJobs = baAPI._loadByStatus(status = 'Removed') #self.assertEqual(len(newJobs), nJobs) # Because removal time is -10.0, jobs should remove immediately baAPI.track() # Assert that jobs were listed as completed myThread = threading.currentThread() newJobs = baAPI._loadByStatus(status = 'Removed', complete = '0') self.assertEqual(len(newJobs), nJobs) return
class StatusPoller(BaseWorkerThread): """ _StatusPoller_ Prototype for polling for JobStatusAir """ def __init__(self, config): """ __init__ Set up the caching and other objects """ self.config = config BaseWorkerThread.__init__(self) self.cachedJobs = [] self.bossAir = BossAirAPI(config=config) # With no timeouts, nothing ever happens # Otherwise we expect a dictionary with the keys representing # the states and the values the timeouts. self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts', {}) # init alert system self.initAlerts(compName="StatusPoller") return def algorithm(self, parameters=None): """ _algorithm_ Handle any exceptions with the actual code """ myThread = threading.currentThread() try: self.checkStatus() except WMException as ex: if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() self.sendAlert(6, msg=str(ex)) raise except Exception as ex: msg = "Unhandled error in statusPoller" msg += str(ex) logging.exception(msg) self.sendAlert(6, msg=msg) if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise StatusPollerException(msg) return def checkStatus(self): """ _checkStatus_ Run the BossAir track() function (self-contained) and then check for jobs that have timed out. """ runningJobs = self.bossAir.track() if len(runningJobs) < 1: # Then we have no jobs return if self.timeouts == {}: # Then we've set outself to have no timeouts # Get out and stay out return # Look for jobs that need to be killed jobsToKill = [] # Now check for timeouts for job in runningJobs: globalState = job.get('globalState', 'Error') statusTime = job.get('status_time', None) timeout = self.timeouts.get(globalState, None) if statusTime == 0: logging.error("Not killing job %i, the status time was zero" % job['id']) continue if timeout != None and statusTime != None: if time.time() - float(statusTime) > float(timeout): # Then the job needs to be killed. logging.info("Killing job %i because it has exceeded timeout for status %s" % (job['id'], globalState)) job['status'] = 'Timeout' jobsToKill.append(job) # We need to show that the jobs are in state timeout # and then kill them. myThread = threading.currentThread() myThread.transaction.begin() self.bossAir.update(jobs=jobsToKill) self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[61304], errorCode=61304) myThread.transaction.commit() return def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
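The checkStatus method above kills a job once it has sat in its current global state longer than the timeout configured for that state, skipping jobs whose status time is zero or missing and states with no configured timeout. A standalone sketch of just that predicate, with illustrative names and example timeout values:

import time

# Sketch of the timeout test used in StatusPoller.checkStatus, with a plain
# dict standing in for the BossAir running-job record.
def hasTimedOut(job, timeouts, now=None):
    """Return True if the job exceeded the timeout configured for its global state."""
    now = now if now is not None else time.time()
    globalState = job.get('globalState', 'Error')
    statusTime = job.get('status_time', None)
    timeout = timeouts.get(globalState, None)
    if not timeout or not statusTime:
        # No timeout configured for this state, or no usable status time
        return False
    return now - float(statusTime) > float(timeout)

timeouts = {'Running': 169200, 'Pending': 432000, 'Error': 1800}  # example values only
job = {'id': 1, 'globalState': 'Error', 'status_time': time.time() - 3600}
print(hasTimedOut(job, timeouts))  # True: one hour in Error exceeds the 1800 s timeout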
def testB_PluginTest(self): """ _PluginTest_ Now check that these functions worked if called through plugins Instead of directly. There are only three plugin """ #return myThread = threading.currentThread() config = self.getConfig() baAPI = BossAirAPI(config = config) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'Xanadu') changeState = ChangeState(config) changeState.propagate(jobDummies, 'created', 'new') changeState.propagate(jobDummies, 'executing', 'created') # Prior to building the job, each job must have a plugin # and user assigned for job in jobDummies: job['plugin'] = 'TestPlugin' job['owner'] = 'tapas' baAPI.submit(jobs = jobDummies) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), nJobs) # Test Plugin should complete all jobs baAPI.track() # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), 0) # Check if they're complete completeJobs = baAPI.getComplete() self.assertEqual(len(completeJobs), nJobs) # Do this test because BossAir is specifically built # to keep it from finding completed jobs result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), nJobs) baAPI.removeComplete(jobs = jobDummies) result = myThread.dbi.processData("SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), 0) return
def testA_APITest(self): """ _APITest_ This is a commissioning test that has very little to do with anything except loading the code. """ #return myThread = threading.currentThread() config = self.getConfig() baAPI = BossAirAPI(config = config) # We should have loaded a plugin self.assertTrue('TestPlugin' in baAPI.plugins.keys()) result = myThread.dbi.processData("SELECT name FROM bl_status")[0].fetchall() statusList = [] for i in result: statusList.append(i.values()[0]) # We should have the plugin states in the database self.assertEqual(statusList.sort(), ['New', 'Dead', 'Gone'].sort()) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs = nJobs) print jobDummies baAPI.createNewJobs(wmbsJobs = jobDummies) runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) deadJobs = baAPI._loadByStatus(status = 'Dead') self.assertEqual(len(deadJobs), 0) raisesException = False self.assertRaises(BossAirException, baAPI._loadByStatus, status = 'FalseStatus') # Change the job status and update it for job in newJobs: job['status'] = 'Dead' baAPI._updateJobs(jobs = newJobs) # Test whether we see the job status as updated newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) deadJobs = baAPI._loadByStatus(status = 'Dead') self.assertEqual(len(deadJobs), nJobs) # Can we load by BossAir ID? loadedJobs = baAPI._loadByID(jobs = deadJobs) self.assertEqual(len(loadedJobs), nJobs) # Can we load via WMBS? loadedJobs = baAPI.loadByWMBS(wmbsJobs = jobDummies) self.assertEqual(len(loadedJobs), nJobs) # See if we can delete jobs baAPI._deleteJobs(jobs = deadJobs) # Confirm that they're gone deadJobs = baAPI._loadByStatus(status = 'Dead') self.assertEqual(len(deadJobs), 0) self.assertEqual(len(baAPI.jobs), 0) return
class JobUpdaterPoller(BaseWorkerThread): """ _JobUpdaterPoller_ Poller class for the JobUpdater """ def __init__(self, config): """ __init__ """ BaseWorkerThread.__init__(self) self.config = config self.bossAir = BossAirAPI(config=self.config) self.reqmgr2 = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl, self.config.WorkQueueManager.dbname) myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.listWorkflowsDAO = self.daoFactory( classname="Workflow.ListForJobUpdater") self.updateWorkflowPrioDAO = self.daoFactory( classname="Workflow.UpdatePriority") self.executingJobsDAO = self.daoFactory( classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus") def setup(self, parameters=None): """ _setup_ """ pass def terminate(self, parameters=None): """ _terminate_ Terminate gracefully. """ pass @timeFunction def algorithm(self, parameters=None): """ _algorithm_ """ try: logging.info("Synchronizing priorities with ReqMgr...") self.synchronizeJobPriority() logging.info( "Priorities were synchronized, wait until the next cycle") except CouchConnectionError as ex: msg = "Caught CouchConnectionError exception in JobUpdater\n" msg += "transactions postponed until the next polling cycle\n" msg += str(ex) logging.exception(msg) except CouchConflictError as ex: msg = "Caught CouchConflictError exception in JobUpdater\n" msg += "transactions postponed until the next polling cycle\n" msg += str(ex) logging.exception(msg) except Exception as ex: if 'Connection refused' in str(ex): logging.warn( "Failed to sync priorities. Trying in the next cycle") else: msg = "Caught unexpected exception in JobUpdater: %s\n" % str( ex) logging.exception(msg) raise JobUpdaterException(msg) def synchronizeJobPriority(self): """ _synchronizeJobPriority_ Check WMBS and WorkQueue for active workflows and compare with the ReqMgr for priority changes. If a priority change occurs then update the job priority in the batch system and the elements in the local queue that have not been injected yet. 
""" # Update the priority of workflows that are not in WMBS and just in local queue priorityCache = {} workflowsToUpdate = {} workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()] for workflow, priority in workflowsToCheck: if workflow not in priorityCache: try: result = self.reqmgr2.getRequestByNames(workflow)[0] priorityCache[workflow] = result[workflow][ 'RequestPriority'] except Exception as ex: logging.error( "Couldn't retrieve the priority of request %s", workflow) logging.error("Error: %s", str(ex)) continue if priority != priorityCache[workflow]: workflowsToUpdate[workflow] = priorityCache[workflow] logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate)) for workflow in workflowsToUpdate: self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow]) # Check the workflows in WMBS priorityCache = {} workflowsToUpdateWMBS = {} workflowsToCheck = self.listWorkflowsDAO.execute() for workflowEntry in workflowsToCheck: workflow = workflowEntry['name'] if workflow not in priorityCache: try: result = self.reqmgr2.getRequestByNames(workflow)[0] priorityCache[workflow] = result[workflow][ 'RequestPriority'] except Exception as ex: logging.error( "Couldn't retrieve the priority of request %s", workflow) logging.error("Error: %s", str(ex)) continue requestPriority = int(priorityCache[workflow]) if requestPriority != int(workflowEntry['workflow_priority']): # Update the workqueue priority for the Available elements self.workqueue.updatePriority(workflow, requestPriority) # Check if there are executing jobs for this particular task if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0: self.bossAir.updateJobInformation( workflow, workflowEntry['task'], requestPriority=priorityCache[workflow], taskPriority=workflowEntry['task_priority']) workflowsToUpdateWMBS[workflow] = priorityCache[workflow] if workflowsToUpdateWMBS: logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS)) self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int( getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.cacheRefreshSize = int( getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int( getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except Exception as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) try: logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) except: pass raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory( classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory( classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory( classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} return
class StatusPoller(BaseWorkerThread): """ _StatusPoller_ Prototype for polling for JobStatusAir """ def __init__(self, config): """ __init__ Set up the caching and other objects """ self.config = config BaseWorkerThread.__init__(self) self.cachedJobs = [] self.bossAir = BossAirAPI(config=config) # With no timeouts, nothing ever happens # Otherwise we expect a dictionary with the keys representing # the states and the values the timeouts. self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts') return @timeFunction def algorithm(self, parameters=None): """ _algorithm_ Handle any exceptions with the actual code """ myThread = threading.currentThread() try: logging.info("Running job status poller algorithm...") self.checkStatus() except WMException as ex: if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise except Exception as ex: msg = "Unhandled error in statusPoller" msg += str(ex) logging.exception(msg) if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise StatusPollerException(msg) return def checkStatus(self): """ _checkStatus_ Run the BossAir track() function (self-contained) and then check for jobs that have timed out. """ runningJobs = self.bossAir.track() if len(runningJobs) < 1: # Then we have no jobs return if not self.timeouts: # Then we've set ourselves to have no timeouts # Get out and stay out return # Look for jobs that need to be killed jobsToKill = defaultdict(list) # Now check for timeouts for job in runningJobs: globalState = job.get('globalState', 'Error') statusTime = job.get('status_time', None) timeout = self.timeouts.get(globalState, None) if statusTime == 0: logging.error("Not killing job %i, the status time was zero", job['id']) continue if timeout and statusTime: if time.time() - float(statusTime) > float(timeout): # Timeout status is used by JobTracker to fail jobs in WMBS database logging.info("Killing job %i because it has exceeded timeout for status '%s'", job['id'], globalState) job['status'] = 'Timeout' jobsToKill[globalState].append(job) timeOutCodeMap = {"Running": 71304, "Pending": 71305, "Error": 71306} # We need to show that the jobs are in state timeout # and then kill them. jobsToKillList = flattenList(jobsToKill.values()) myThread = threading.currentThread() myThread.transaction.begin() self.bossAir.update(jobs=jobsToKillList) for preJobStatus in jobsToKill: eCode = timeOutCodeMap.get(preJobStatus, 71307) # it shouldn't have 71307 (states should be among Running, Pending, Error) self.bossAir.kill(jobs=jobsToKill[preJobStatus], killMsg=WM_JOB_ERROR_CODES[eCode], errorCode=eCode) myThread.transaction.commit() return def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config # DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) # Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config, insertStates=True) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsToCache = int(getattr(self.config.JobSubmitter, 'maxJobsToCache', 50000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') self.drainGracePeriod = getattr(self.config.JobSubmitter, 'drainGraceTime', 2 * 24 * 60 * 60) # 2 days # Used for speed draining the agent self.enableAllSites = False # Additions for caching-based JobSubmitter self.jobsByPrio = {} # key'ed by the final job priority, which contains a set of job ids self.jobDataCache = {} # key'ed by the job id, containing the whole job info dict self.jobsToPackage = {} self.locationDict = {} self.drainSites = dict() self.drainSitesSet = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache() self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return
class JobTrackerPoller(BaseWorkerThread): """ _JobTrackerPoller_ Polls the BossAir database for complete jobs Handles completed jobs """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=config) self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.jobListAction = self.daoFactory(classname="Jobs.GetAllJobs") self.setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath") def setup(self, parameters=None): """ Load DB objects required for queries """ return def terminate(self, params=None): """ _terminate_ Terminate the function after one more run. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) return @timeFunction def algorithm(self, parameters=None): """ Performs the archiveJobs method, looking for each type of failure And deal with it as desired. """ logging.info("Running Tracker algorithm") myThread = threading.currentThread() try: self.trackJobs() except WMException: if getattr(myThread, 'transaction', None): myThread.transaction.rollback() raise except Exception as ex: msg = "Unknown exception in JobTracker!\n" msg += str(ex) if getattr(myThread, 'transaction', None): myThread.transaction.rollback() logging.error(msg) raise JobTrackerException(msg) return def trackJobs(self): """ _trackJobs_ Finds a list of running jobs and the sites that they're running at, and passes that off to tracking. """ passedJobs = [] failedJobs = [] jobList = self.jobListAction.execute(state="executing") logging.info("Have list of %i executing jobs in WMBS", len(jobList)) if not jobList: return # retrieve completed jobs from BossAir that are 'executing' in WMBS completeJobs = self.bossAir.getComplete() logging.info( "Have list of %i jobs complete in BossAir but executing in WMBS", len(completeJobs)) logging.debug(completeJobs) for job in completeJobs: if job['id'] not in jobList: logging.error( "Found a complete job in BossAir without a correspondent in WMBS!" ) continue if job['status'].lower() == 'timeout': failedJobs.append(job) else: passedJobs.append(job) # Assume all these jobs "passed" if they aren't in timeout self.passJobs(passedJobs) self.failJobs(failedJobs) return def failJobs(self, failedJobs): """ _failJobs_ Dump those jobs that have failed due to timeout """ if len(failedJobs) == 0: return jrBinds = [] for job in failedJobs: # Make sure the job object goes packed with fwjr_path to be persisted in couch jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count'])) jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath}) fwjr = Report() try: fwjr.load(jrPath) except Exception: # Something went wrong reading the pickle logging.error( "The pickle in %s could not be loaded, generating a new one", jrPath) fwjr = Report() fwjr.addError("NoJobReport", 99303, "NoJobReport", WM_JOB_ERROR_CODES[99303]) fwjr.save(jrPath) job["fwjr"] = fwjr myThread = threading.currentThread() myThread.transaction.begin() self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True) self.changeState.propagate(failedJobs, 'jobfailed', 'executing') logging.info("Failed %i jobs", len(failedJobs)) myThread.transaction.commit() return def passJobs(self, passedJobs): """ _passJobs_ Pass jobs and move their stuff? 
""" if len(passedJobs) == 0: return jrBinds = [] for job in passedJobs: jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count'])) jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath}) myThread = threading.currentThread() myThread.transaction.begin() self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True) self.changeState.propagate(passedJobs, 'complete', 'executing') myThread.transaction.commit() logging.info("Passed %i jobs", len(passedJobs)) return
class StatusPoller(BaseWorkerThread): """ _StatusPoller_ Prototype for polling for JobStatusAir """ def __init__(self, config): """ __init__ Set up the caching and other objects """ self.config = config BaseWorkerThread.__init__(self) self.cachedJobs = [] self.bossAir = BossAirAPI(config=config) # With no timeouts, nothing ever happens # Otherwise we expect a dictionary with the keys representing # the states and the values the timeouts. self.timeouts = getattr(config.JobStatusLite, 'stateTimeouts') # init alert system self.initAlerts(compName="StatusPoller") return def algorithm(self, parameters=None): """ _algorithm_ Handle any exceptions with the actual code """ myThread = threading.currentThread() try: logging.info("Running job status poller algorithm...") self.checkStatus() except WMException as ex: if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() self.sendAlert(6, msg=str(ex)) raise except Exception as ex: msg = "Unhandled error in statusPoller" msg += str(ex) logging.exception(msg) self.sendAlert(6, msg=msg) if getattr(myThread, 'transaction', None): myThread.transaction.rollbackForError() raise StatusPollerException(msg) return def checkStatus(self): """ _checkStatus_ Run the BossAir track() function (self-contained) and then check for jobs that have timed out. """ runningJobs = self.bossAir.track() if len(runningJobs) < 1: # Then we have no jobs return if not self.timeouts: # Then we've set ourselves to have no timeouts # Get out and stay out return # Look for jobs that need to be killed jobsToKill = [] # Now check for timeouts for job in runningJobs: globalState = job.get('globalState', 'Error') statusTime = job.get('status_time', None) timeout = self.timeouts.get(globalState, None) if statusTime == 0: logging.error("Not killing job %i, the status time was zero", job['id']) continue if timeout and statusTime: if time.time() - float(statusTime) > float(timeout): # Timeout status is used by JobTracker to fail jobs in WMBS database logging.info( "Killing job %i because it has exceeded timeout for status '%s'", job['id'], globalState) job['status'] = 'Timeout' jobsToKill.append(job) # We need to show that the jobs are in state timeout # and then kill them. myThread = threading.currentThread() myThread.transaction.begin() self.bossAir.update(jobs=jobsToKill) self.bossAir.kill(jobs=jobsToKill, killMsg=WM_JOB_ERROR_CODES[71304], errorCode=71304) myThread.transaction.commit() return def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
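The timeout rule applied per job in checkStatus(), pulled out as a small standalone function for clarity. The timeouts argument mirrors config.JobStatusLite.stateTimeouts; shouldTimeOut is a hypothetical name used only for this sketch.

import time

def shouldTimeOut(job, timeouts, now=None):
    # True when the job has sat in its current global state longer than the
    # timeout configured for that state; a status_time of zero is skipped,
    # matching the guard in checkStatus().
    now = now or time.time()
    globalState = job.get('globalState', 'Error')
    statusTime = job.get('status_time', None)
    timeout = timeouts.get(globalState, None)
    if statusTime == 0:
        return False
    if timeout and statusTime:
        return (now - float(statusTime)) > float(timeout)
    return False

timeouts = {'Pending': 3600, 'Running': 4 * 24 * 3600}
job = {'globalState': 'Pending', 'status_time': time.time() - 7200}
print(shouldTimeOut(job, timeouts))   # True: pending for ~2h against a 1h limit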
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int( getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int( getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int( getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory( classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory( classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory( classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr( self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache( ) self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return
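The constructor above leans heavily on getattr-based defaults: optional JobSubmitter settings fall back to hard-coded values when they are absent from the component configuration. A small illustration, using a dummy section object rather than the real WMCore Configuration class:

class _Section(object):
    # simplified stand-in for a configuration section, for illustration only
    pass

config = _Section()
config.JobSubmitter = _Section()
config.JobSubmitter.maxJobsPerPoll = 2000   # explicitly configured
# cacheRefreshSize deliberately left unset

maxJobsPerPoll = int(getattr(config.JobSubmitter, 'maxJobsPerPoll', 1000))       # -> 2000
cacheRefreshSize = int(getattr(config.JobSubmitter, 'cacheRefreshSize', 30000))  # -> 30000 (default)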
def testD_PrototypeChain(self): """ _PrototypeChain_ Prototype the BossAir workflow """ myThread = threading.currentThread() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'CondorPlugin' baAPI = BossAirAPI(config = config) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs = nSubs, nJobs = nJobs, task = workload.getTask("ReReco"), workloadSpec = os.path.join(self.testDir, 'workloadTest', workloadName), site = 'se.T2_US_UCSD') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config = config) jobTracker = JobTrackerPoller(config = config) statusPoller = StatusPoller(config = config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nSubs * nJobs) # Check WMBS getJobsAction = self.daoFactory(classname = "Jobs.GetAllJobs") result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) statusPoller.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), nSubs * nJobs) # Tracker should do nothing jobTracker.algorithm() result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) # Wait for jobs to timeout due to short Pending wait period time.sleep(12) statusPoller.algorithm() newJobs = baAPI._loadByStatus(status = 'Idle') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status = 'Timeout', complete = '0') self.assertEqual(len(newJobs), nSubs * nJobs) # Jobs should be gone nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0) # Check if they're complete completeJobs = baAPI.getComplete() self.assertEqual(len(completeJobs), nSubs * nJobs) # Because they timed out, they all should have failed jobTracker.algorithm() result = getJobsAction.execute(state = 'Executing', jobType = "Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state = 'JobFailed', jobType = "Processing") self.assertEqual(len(result), nSubs * nJobs) return
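The "short Pending wait period" this test waits out comes from the stateTimeouts dictionary read by the StatusPoller. A hypothetical configuration fragment of the kind getConfig() might produce for such a test; only stateTimeouts is read by the code shown here, and the specific values and the pollInterval attribute are assumptions.

from WMCore.Configuration import Configuration

config = Configuration()
config.component_('JobStatusLite')
# a few-second Pending timeout forces the quick 'Timeout' path during the test
config.JobStatusLite.stateTimeouts = {'Pending': 10, 'Running': 86400}
config.JobStatusLite.pollInterval = 10   # assumed attribute, not read by the excerpts above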
class JobSubmitterPoller(BaseWorkerThread): """ _JobSubmitterPoller_ The jobSubmitterPoller takes the jobs and organizes them into packages before sending them to the individual plugin submitters. """ def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache() else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return def getPackageCollection(self, sandboxDir): """ _getPackageCollection_ Given a jobID figure out which packageCollection it should belong in. 
""" rawList = os.listdir(sandboxDir) collections = [] numberList = [] for entry in rawList: if 'PackageCollection' in entry: collections.append(entry) # If we have no collections, return 0 (PackageCollection_0) if len(collections) < 1: return 0 # Loop over the list of PackageCollections for collection in collections: collectionPath = os.path.join(sandboxDir, collection) packageList = os.listdir(collectionPath) collectionNum = int(collection.split('_')[1]) if len(packageList) < self.collSize: return collectionNum else: numberList.append(collectionNum) # If we got here, then all collections are full. We'll need # a new one. Find the highest number, increment by one numberList.sort() return numberList[-1] + 1 def addJobsToPackage(self, loadedJob): """ _addJobsToPackage_ Add a job to a job package and then return the batch ID for the job. Packages are only written out to disk when they contain 100 jobs. The flushJobsPackages() method must be called after all jobs have been added to the cache and before they are actually submitted to make sure all the job packages have been written to disk. """ if loadedJob["workflow"] not in self.jobsToPackage: # First, let's pull all the information from the loadedJob batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"]) sandboxDir = os.path.dirname(loadedJob["sandbox"]) # Second, assemble the jobPackage location collectionIndex = self.getPackageCollection(sandboxDir) collectionDir = os.path.join(sandboxDir, 'PackageCollection_%i' % collectionIndex, 'batch_%s' % batchid) # Now create the package object self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid, 'id': loadedJob['id'], "package": JobPackage(directory=collectionDir)} jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"] jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob() batchDir = jobPackage['directory'] if len(jobPackage.keys()) == self.packageSize: if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[loadedJob["workflow"]] return batchDir def flushJobPackages(self): """ _flushJobPackages_ Write any jobs packages to disk that haven't been written out already. """ workflowNames = self.jobsToPackage.keys() for workflowName in workflowNames: jobPackage = self.jobsToPackage[workflowName]["package"] batchDir = jobPackage['directory'] if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[workflowName] return def refreshCache(self): """ _refreshCache_ Query WMBS for all jobs in the 'created' state. For all jobs returned from the query, check if they already exist in the cache. If they don't, unpickle them and combine their site white and black list with the list of locations they can run at. Add them to the cache. 
Each entry in the cache is a tuple with five items: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory """ badJobs = dict([(x, []) for x in range(71101, 71105)]) dbJobs = set() logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs)) if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \ self.refreshPollingCount >= self.skipRefreshCount: newJobs = self.listJobsAction.execute() self.refreshPollingCount = 0 if self.useReqMgrForCompletionCheck: # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() else: #T0Agent abortedAndForceCompleteRequests = [] logging.info("Found %s new jobs to be submitted.", len(newJobs)) else: self.refreshPollingCount += 1 newJobs = [] dbJobs = self.cachedJobIDs abortedAndForceCompleteRequests = [] logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs)) logging.info("Determining possible sites for new jobs...") jobCount = 0 for newJob in newJobs: # whether newJob belongs to aborted or force-complete workflow, and skip it if it is. if (newJob['request_name'] in abortedAndForceCompleteRequests) and \ (newJob['type'] not in ['LogCollect', "Cleanup"]): continue jobID = newJob['id'] dbJobs.add(jobID) if jobID in self.cachedJobIDs: continue jobCount += 1 if jobCount % 5000 == 0: logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs)) pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl") if not os.path.isfile(pickledJobPath): # Then we have a problem - there's no file logging.error("Could not find pickled jobObject %s", pickledJobPath) badJobs[71103].append(newJob) continue try: jobHandle = open(pickledJobPath, "r") loadedJob = pickle.load(jobHandle) jobHandle.close() except Exception as ex: msg = "Error while loading pickled job object %s\n" % pickledJobPath msg += str(ex) logging.error(msg) raise JobSubmitterPollerException(msg) loadedJob['retry_count'] = newJob['retry_count'] # figure out possible locations for job possibleLocations = loadedJob["possiblePSN"] # Create another set of locations that may change when a site goes white/black listed # Does not care about the non_draining or aborted sites, they may change and that is the point potentialLocations = set() potentialLocations.update(possibleLocations) # now check for sites in drain and adjust the possible locations # also check if there is at least one site left to run the job if len(possibleLocations) == 0: newJob['name'] = loadedJob['name'] newJob['fileLocations'] = loadedJob.get('fileLocations', []) newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', []) newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', []) badJobs[71101].append(newJob) continue else: nonAbortSites = [x for x in possibleLocations if x not in self.abortSites] if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job possibleLocations = nonAbortSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71102].append(newJob) continue # try to remove draining sites if possible, this is needed to stop # jobs that could run anywhere blocking draining sites # if the job type is Merge, LogCollect or Cleanup this is skipped if newJob['type'] not in ('LogCollect', 'Merge', 'Cleanup', 'Harvesting'): nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites] if nonDrainingSites: # if >1 viable 
non-draining site remove draining ones possibleLocations = nonDrainingSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71104].append(newJob) continue # locations clear of abort and draining sites newJob['possibleLocations'] = possibleLocations batchDir = self.addJobsToPackage(loadedJob) self.cachedJobIDs.add(jobID) # calculate the final job priority such that we can order cached jobs by prio jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority'] if jobPrio not in self.cachedJobs: self.cachedJobs[jobPrio] = {} # now add basic information keyed by the jobid self.cachedJobs[jobPrio][jobID] = newJob # allow job baggage to override numberOfCores # => used for repacking to get more slots/disk numberOfCores = loadedJob.get('numberOfCores', 1) if numberOfCores == 1: baggage = loadedJob.getBaggage() numberOfCores = getattr(baggage, "numberOfCores", 1) loadedJob['numberOfCores'] = numberOfCores # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob) jobInfo = {'id': jobID, 'requestName': newJob['request_name'], 'taskName': newJob['task_name'], 'taskType': newJob['type'], 'cache_dir': newJob["cache_dir"], 'priority': newJob['wf_priority'], 'taskID': newJob['task_id'], 'retry_count': newJob["retry_count"], 'taskPriority': None, # update from the thresholds 'custom': {'location': None}, # update later 'packageDir': batchDir, 'sandbox': loadedJob["sandbox"], # remove before submit 'userdn': loadedJob.get("ownerDN", None), 'usergroup': loadedJob.get("ownerGroup", ''), 'userrole': loadedJob.get("ownerRole", ''), 'possibleSites': frozenset(possibleLocations), # abort and drain sites filtered out 'potentialSites': frozenset(potentialLocations), # original list of sites 'scramArch': loadedJob.get("scramArch", None), 'swVersion': loadedJob.get("swVersion", None), 'name': loadedJob["name"], 'proxyPath': loadedJob.get("proxyPath", None), 'estimatedJobTime': loadedJob.get("estimatedJobTime", None), 'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None), 'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None), 'numberOfCores': loadedJob.get("numberOfCores", 1), # may update it later 'inputDataset': loadedJob.get('inputDataset', None), 'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None), 'allowOpportunistic': loadedJob.get('allowOpportunistic', False)} self.jobDataCache[jobID] = jobInfo # Register failures in submission for errorCode in badJobs: if badJobs[errorCode]: logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode) self._handleSubmitFailedJobs(badJobs[errorCode], errorCode) # If there are any leftover jobs, we want to get rid of them. self.flushJobPackages() # We need to remove any jobs from the cache that were not returned in # the last call to the database. 
jobIDsToPurge = self.cachedJobIDs - dbJobs self._purgeJobsFromCache(jobIDsToPurge) logging.info("Done pruning killed jobs, moving on to submit.") return def removeAbortedForceCompletedWorkflowFromCache(self): abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() jobIDsToPurge = set() for jobID, jobInfo in self.jobDataCache.iteritems(): if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \ (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]): jobIDsToPurge.add(jobID) self._purgeJobsFromCache(jobIDsToPurge) return def _purgeJobsFromCache(self, jobIDsToPurge): if len(jobIDsToPurge) == 0: return self.cachedJobIDs -= jobIDsToPurge for jobid in jobIDsToPurge: self.jobDataCache.pop(jobid, None) for jobPrio in self.cachedJobs: if self.cachedJobs[jobPrio].pop(jobid, None): # then the jobid was found, go to the next one break return def _handleSubmitFailedJobs(self, badJobs, exitCode): """ __handleSubmitFailedJobs_ For a default job report for the exitCode and register in the job. Preserve it on disk as well. Propagate the failure to the JobStateMachine. """ fwjrBinds = [] for job in badJobs: job['couch_record'] = None job['fwjr'] = Report() if exitCode in [71102, 71104]: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations'])) elif exitCode in [71101]: # there is no possible site if job.get("fileLocations"): job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ": file locations: " + ', '.join(job['fileLocations']) + ": site white list: " + ', '.join(job['siteWhitelist']) + ": site black list: " + ', '.join(job['siteBlacklist'])) else: # This is temporary addition if this is patched for existing agent. # If jobs are created before the patch is applied fileLocations is not set. # TODO. remove this later for new agent job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ": Job is created before this patch. Please check this input for the jobs: %s " % job['fwjr'].getAllInputFiles()) else: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]) fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count'])) job['fwjr'].setJobID(job['id']) try: job['fwjr'].save(fwjrPath) fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath}) except IOError as ioer: logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer)) self.changeState.propagate(badJobs, "submitfailed", "created") self.setFWJRPathAction.execute(binds=fwjrBinds) return def getThresholds(self): """ _getThresholds_ Retrieve submit thresholds, which considers what is pending and running for those sites. Also update the list of draining and abort/down sites. Finally, creates a map between task type and its priority. 
""" self.taskTypePrioMap = {} newDrainSites = set() newAbortSites = set() rcThresholds = self.resourceControl.listThresholdsForSubmit() for siteName in rcThresholds.keys(): # Add threshold if we don't have it already state = rcThresholds[siteName]["state"] if state == "Draining": newDrainSites.add(siteName) if state in ["Down", "Aborted"]: newAbortSites.add(siteName) # then update the task type x task priority mapping if not self.taskTypePrioMap: for task, value in rcThresholds[siteName]['thresholds'].items(): self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority # When the list of drain/abort sites change between iteration then a location # refresh is needed, for now it forces a full cache refresh if newDrainSites != self.drainSites or newAbortSites != self.abortSites: logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.") self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.currentRcThresholds = rcThresholds self.abortSites = newAbortSites self.drainSites = newDrainSites return def assignJobLocations(self): """ _assignJobLocations_ Loop through the submit thresholds and pull sites out of the job cache as we discover open slots. This will return a list of tuple where each tuple will have six elements: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory - SE name of the site to run at """ jobsToSubmit = {} jobsToUncache = [] jobsCount = 0 exitLoop = False jobSubmitLogBySites = defaultdict(Counter) jobSubmitLogByPriority = defaultdict(Counter) # iterate over jobs from the highest to the lowest prio for jobPrio in sorted(self.cachedJobs, reverse=True): # then we're completely done and have our basket full of jobs to submit if exitLoop: break # start eating through the elder jobs first for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')): jobid = job['id'] jobType = job['type'] possibleSites = job['possibleLocations'] jobSubmitLogByPriority[jobPrio]['Total'] += 1 # now look for sites with free pending slots for siteName in possibleSites: if siteName not in self.currentRcThresholds: logging.warn("Have a job for %s which is not in the resource control", siteName) continue try: totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"] totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"] totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"] totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"] taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"] taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"] taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"] taskPriority = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"] except KeyError as ex: msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType) msg += str(ex) logging.error(msg) continue # check if site has free pending slots AND free pending task slots if totalPendingJobs >= totalPendingSlots or taskPendingJobs >= taskPendingSlots: jobSubmitLogBySites[siteName]["NoPendingSlot"] += 1 logging.debug("Found a job for %s which has no free pending slots", siteName) continue # check if site overall thresholds have free slots if totalPendingJobs + totalRunningJobs >= totalPendingSlots + 
totalRunningSlots: jobSubmitLogBySites[siteName]["NoRunningSlot"] += 1 logging.debug("Found a job for %s which has no free overall slots", siteName) continue # finally, check whether task has free overall slots if taskPendingJobs + taskRunningJobs >= taskPendingSlots + taskRunningSlots: jobSubmitLogBySites[siteName]["NoTaskSlot"] += 1 logging.debug("Found a job for %s which has no free task slots", siteName) continue # otherwise, update the site/task thresholds and the component job counter self.currentRcThresholds[siteName]["total_pending_jobs"] += 1 self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1 jobsCount += 1 # load (and remove) the job dictionary object from jobDataCache cachedJob = self.jobDataCache.pop(jobid) jobsToUncache.append((jobPrio, jobid)) # Sort jobs by jobPackage package = cachedJob['packageDir'] if package not in jobsToSubmit.keys(): jobsToSubmit[package] = [] # Add the sandbox to a global list self.sandboxPackage[package] = cachedJob.pop('sandbox') # Now update the job dictionary object cachedJob['custom'] = {'location': siteName} cachedJob['taskPriority'] = taskPriority # Get this job in place to be submitted by the plugin jobsToSubmit[package].append(cachedJob) jobSubmitLogBySites[siteName]["submitted"] += 1 jobSubmitLogByPriority[jobPrio]['submitted'] += 1 # found a site to submit this job, so go to the next job break # set the flag and get out of the job iteration if jobsCount >= self.maxJobsPerPoll: exitLoop = True break # jobs that are going to be submitted must be removed from all caches for prio, jobid in jobsToUncache: self.cachedJobs[prio].pop(jobid) self.cachedJobIDs.remove(jobid) logging.info("Site submission report: %s", dict(jobSubmitLogBySites)) logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority)) logging.info("Have %s packages to submit.", len(jobsToSubmit)) logging.info("Have %s jobs to submit.", jobsCount) logging.info("Done assigning site locations.") return jobsToSubmit def submitJobs(self, jobsToSubmit): """ _submitJobs_ Actually do the submission of the jobs """ jobList = [] idList = [] if len(jobsToSubmit) == 0: logging.debug("There are no packages to submit.") return for package in jobsToSubmit.keys(): sandbox = self.sandboxPackage[package] jobs = jobsToSubmit.get(package, []) for job in jobs: job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location']) job['sandbox'] = sandbox idList.append({'jobid': job['id'], 'location': job['custom']['location']}) #Clean out the package reference del self.sandboxPackage[package] jobList.extend(jobs) myThread = threading.currentThread() myThread.transaction.begin() # Run the actual underlying submit code using bossAir successList, failList = self.bossAir.submit(jobs=jobList) logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList)) # Propagate states in the WMBS database logging.debug("Propagating success state to WMBS.") self.changeState.propagate(successList, 'executing', 'created') logging.debug("Propagating fail state to WMBS.") self.changeState.propagate(failList, 'submitfailed', 'created') # At the end we mark the locations of the jobs # This applies even to failed jobs, since the location # could be part of the failure reason. 
logging.debug("Updating job location...") self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn, transaction=True) myThread.transaction.commit() logging.info("Transaction cycle successfully completed.") return def getSiteInfo(self, jobSite): """ _getSiteInfo_ This is how you get the name of a CE and the plugin for a job """ if not jobSite in self.locationDict.keys(): siteInfo = self.locationAction.execute(siteName=jobSite) self.locationDict[jobSite] = siteInfo[0] return (self.locationDict[jobSite].get('ce_name'), self.locationDict[jobSite].get('plugin'), self.locationDict[jobSite].get('cms_name')) def algorithm(self, parameters=None): """ _algorithm_ Try to, in order: 1) Refresh the cache 2) Find jobs for all the necessary sites 3) Submit the jobs to the plugin """ try: myThread = threading.currentThread() self.getThresholds() self.refreshCache() if self.useReqMgrForCompletionCheck: # only runs when reqmgr is used (not Tier0) self.removeAbortedForceCompletedWorkflowFromCache() jobsToSubmit = self.assignJobLocations() self.submitJobs(jobsToSubmit=jobsToSubmit) except WMException: if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise except Exception as ex: msg = 'Fatal error in JobSubmitter:\n' msg += str(ex) #msg += str(traceback.format_exc()) msg += '\n\n' logging.error(msg) if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise JobSubmitterPollerException(msg) return def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
class JobUpdaterPoller(BaseWorkerThread): """ _JobUpdaterPoller_ Poller class for the JobUpdater """ def __init__(self, config): """ __init__ """ BaseWorkerThread.__init__(self) self.config = config self.bossAir = BossAirAPI(config=self.config) self.reqmgr2 = ReqMgr(self.config.JobUpdater.reqMgr2Url) self.workqueue = WorkQueue(self.config.WorkQueueManager.couchurl, self.config.WorkQueueManager.dbname) myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.listWorkflowsDAO = self.daoFactory(classname="Workflow.ListForJobUpdater") self.updateWorkflowPrioDAO = self.daoFactory(classname="Workflow.UpdatePriority") self.executingJobsDAO = self.daoFactory(classname="Jobs.GetNumberOfJobsForWorkflowTaskStatus") def setup(self, parameters=None): """ _setup_ """ pass def terminate(self, parameters=None): """ _terminate_ Terminate gracefully. """ pass def algorithm(self, parameters=None): """ _algorithm_ """ try: logging.info("Synchronizing priorities with ReqMgr...") self.synchronizeJobPriority() logging.info("Priorities were synchronized, wait until the next cycle") except CouchConnectionError as ex: msg = "Caught CouchConnectionError exception in JobUpdater\n" msg += "transactions postponed until the next polling cycle\n" msg += str(ex) logging.exception(msg) except CouchConflictError as ex: msg = "Caught CouchConflictError exception in JobUpdater\n" msg += "transactions postponed until the next polling cycle\n" msg += str(ex) logging.exception(msg) except Exception as ex: if 'Connection refused' in str(ex): logging.warn("Failed to sync priorities. Trying in the next cycle") else: msg = "Caught unexpected exception in JobUpdater: %s\n" % str(ex) logging.exception(msg) raise JobUpdaterException(msg) def synchronizeJobPriority(self): """ _synchronizeJobPriority_ Check WMBS and WorkQueue for active workflows and compare with the ReqMgr for priority changes. If a priority change occurs then update the job priority in the batch system and the elements in the local queue that have not been injected yet. 
""" # Update the priority of workflows that are not in WMBS and just in local queue priorityCache = {} workflowsToUpdate = {} workflowsToCheck = [x for x in self.workqueue.getAvailableWorkflows()] for workflow, priority in workflowsToCheck: if workflow not in priorityCache: try: priorityCache[workflow] = self.reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority'] except Exception as ex: logging.error("Couldn't retrieve the priority of request %s", workflow) logging.error("Error: %s", str(ex)) continue if priority != priorityCache[workflow]: workflowsToUpdate[workflow] = priorityCache[workflow] logging.info("Found %d workflows to update in workqueue", len(workflowsToUpdate)) for workflow in workflowsToUpdate: self.workqueue.updatePriority(workflow, workflowsToUpdate[workflow]) # Check the workflows in WMBS priorityCache = {} workflowsToUpdateWMBS = {} workflowsToCheck = self.listWorkflowsDAO.execute() for workflowEntry in workflowsToCheck: workflow = workflowEntry['name'] if workflow not in priorityCache: try: priorityCache[workflow] = self.reqmgr2.getRequestByNames(workflow)[workflow]['RequestPriority'] except Exception as ex: logging.error("Couldn't retrieve the priority of request %s", workflow) logging.error("Error: %s", str(ex)) continue requestPriority = int(priorityCache[workflow]) if requestPriority != int(workflowEntry['workflow_priority']): # Update the workqueue priority for the Available elements self.workqueue.updatePriority(workflow, requestPriority) # Check if there are executing jobs for this particular task if self.executingJobsDAO.execute(workflow, workflowEntry['task']) > 0: self.bossAir.updateJobInformation(workflow, workflowEntry['task'], requestPriority=priorityCache[workflow], taskPriority=workflowEntry['task_priority']) workflowsToUpdateWMBS[workflow] = priorityCache[workflow] if workflowsToUpdateWMBS: logging.info("Updating %d workflows in WMBS.", len(workflowsToUpdateWMBS)) self.updateWorkflowPrioDAO.execute(workflowsToUpdateWMBS)
def testE_FullChain(self): """ _FullChain_ Full test going through the chain; using polling cycles and everything """ from WMComponent.JobSubmitter.JobSubmitter import JobSubmitter from WMComponent.JobStatusLite.JobStatusLite import JobStatusLite from WMComponent.JobTracker.JobTracker import JobTracker myThread = threading.currentThread() nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'SimpleCondorPlugin' baAPI = BossAirAPI(config=config, insertStates=True) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 1 nJobs = 2 cacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site='se.T2_US_UCSD') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitter(config=config) jobTracker = JobTracker(config=config) jobStatus = JobStatusLite(config=config) jobSubmitter.prepareToStart() jobTracker.prepareToStart() jobStatus.prepareToStart() # What should happen here: # 1) The JobSubmitter should submit the jobs # 2) Because of the ridiculously short time on pending jobs # the JobStatus poller should mark the jobs as done # and kill them. # 3) The JobTracker should realize there are finished jobs # # So at the end of several polling cycles, the jobs should all # be done, but be in the failed status (they timed out) time.sleep(20) myThread.workerThreadManager.terminateWorkers() getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state='JobFailed', jobType="Processing") self.assertEqual(len(result), nJobs * nSubs) return
class JobSubmitterPoller(BaseWorkerThread): """ _JobSubmitterPoller_ The jobSubmitterPoller takes the jobs and organizes them into packages before sending them to the individual plugin submitters. """ def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int( getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int( getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int( getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory( classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory( classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory( classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr( self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache( ) self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return def getPackageCollection(self, sandboxDir): """ _getPackageCollection_ Given a jobID figure out which packageCollection it should belong in. 
""" rawList = os.listdir(sandboxDir) collections = [] numberList = [] for entry in rawList: if 'PackageCollection' in entry: collections.append(entry) # If we have no collections, return 0 (PackageCollection_0) if len(collections) < 1: return 0 # Loop over the list of PackageCollections for collection in collections: collectionPath = os.path.join(sandboxDir, collection) packageList = os.listdir(collectionPath) collectionNum = int(collection.split('_')[1]) if len(packageList) < self.collSize: return collectionNum else: numberList.append(collectionNum) # If we got here, then all collections are full. We'll need # a new one. Find the highest number, increment by one numberList.sort() return numberList[-1] + 1 def addJobsToPackage(self, loadedJob): """ _addJobsToPackage_ Add a job to a job package and then return the batch ID for the job. Packages are only written out to disk when they contain 100 jobs. The flushJobsPackages() method must be called after all jobs have been added to the cache and before they are actually submitted to make sure all the job packages have been written to disk. """ if loadedJob["workflow"] not in self.jobsToPackage: # First, let's pull all the information from the loadedJob batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"]) sandboxDir = os.path.dirname(loadedJob["sandbox"]) # Second, assemble the jobPackage location collectionIndex = self.getPackageCollection(sandboxDir) collectionDir = os.path.join( sandboxDir, 'PackageCollection_%i' % collectionIndex, 'batch_%s' % batchid) # Now create the package object self.jobsToPackage[loadedJob["workflow"]] = { "batchid": batchid, 'id': loadedJob['id'], "package": JobPackage(directory=collectionDir) } jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"] jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob() batchDir = jobPackage['directory'] if len(jobPackage.keys()) == self.packageSize: if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[loadedJob["workflow"]] return batchDir def flushJobPackages(self): """ _flushJobPackages_ Write any jobs packages to disk that haven't been written out already. """ workflowNames = self.jobsToPackage.keys() for workflowName in workflowNames: jobPackage = self.jobsToPackage[workflowName]["package"] batchDir = jobPackage['directory'] if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[workflowName] return def refreshCache(self): """ _refreshCache_ Query WMBS for all jobs in the 'created' state. For all jobs returned from the query, check if they already exist in the cache. If they don't, unpickle them and combine their site white and black list with the list of locations they can run at. Add them to the cache. 
Each entry in the cache is a tuple with five items: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory """ badJobs = dict([(x, []) for x in range(71101, 71105)]) dbJobs = set() logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs)) if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \ self.refreshPollingCount >= self.skipRefreshCount: newJobs = self.listJobsAction.execute() self.refreshPollingCount = 0 if self.useReqMgrForCompletionCheck: # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData( ) else: #T0Agent abortedAndForceCompleteRequests = [] logging.info("Found %s new jobs to be submitted.", len(newJobs)) else: self.refreshPollingCount += 1 newJobs = [] dbJobs = self.cachedJobIDs abortedAndForceCompleteRequests = [] logging.info( "Skipping cache update to be submitted. (%s job in cache)", len(dbJobs)) logging.info("Determining possible sites for new jobs...") jobCount = 0 for newJob in newJobs: # whether newJob belongs to aborted or force-complete workflow, and skip it if it is. if (newJob['request_name'] in abortedAndForceCompleteRequests) and \ (newJob['type'] not in ['LogCollect', "Cleanup"]): continue jobID = newJob['id'] dbJobs.add(jobID) if jobID in self.cachedJobIDs: continue jobCount += 1 if jobCount % 5000 == 0: logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs)) pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl") if not os.path.isfile(pickledJobPath): # Then we have a problem - there's no file logging.error("Could not find pickled jobObject %s", pickledJobPath) badJobs[71103].append(newJob) continue try: jobHandle = open(pickledJobPath, "r") loadedJob = pickle.load(jobHandle) jobHandle.close() except Exception as ex: msg = "Error while loading pickled job object %s\n" % pickledJobPath msg += str(ex) logging.error(msg) raise JobSubmitterPollerException(msg) loadedJob['retry_count'] = newJob['retry_count'] # figure out possible locations for job possibleLocations = loadedJob["possiblePSN"] # Create another set of locations that may change when a site goes white/black listed # Does not care about the non_draining or aborted sites, they may change and that is the point potentialLocations = set() potentialLocations.update(possibleLocations) # now check for sites in drain and adjust the possible locations # also check if there is at least one site left to run the job if len(possibleLocations) == 0: newJob['name'] = loadedJob['name'] newJob['fileLocations'] = loadedJob.get('fileLocations', []) newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', []) newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', []) badJobs[71101].append(newJob) continue else: nonAbortSites = [ x for x in possibleLocations if x not in self.abortSites ] if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job possibleLocations = nonAbortSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71102].append(newJob) continue # try to remove draining sites if possible, this is needed to stop # jobs that could run anywhere blocking draining sites # if the job type is Merge, LogCollect or Cleanup this is skipped if newJob['type'] not in self.ioboundTypes: nonDrainingSites = [ x for x in possibleLocations if x not in self.drainSites ] if nonDrainingSites: # if >1 viable non-draining site remove 
draining ones possibleLocations = nonDrainingSites else: newJob['name'] = loadedJob['name'] newJob['possibleLocations'] = possibleLocations badJobs[71104].append(newJob) continue # locations clear of abort and draining sites newJob['possibleLocations'] = possibleLocations batchDir = self.addJobsToPackage(loadedJob) self.cachedJobIDs.add(jobID) # calculate the final job priority such that we can order cached jobs by prio jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority'] if jobPrio not in self.cachedJobs: self.cachedJobs[jobPrio] = {} # now add basic information keyed by the jobid self.cachedJobs[jobPrio][jobID] = newJob # allow job baggage to override numberOfCores # => used for repacking to get more slots/disk numberOfCores = loadedJob.get('numberOfCores', 1) if numberOfCores == 1: baggage = loadedJob.getBaggage() numberOfCores = getattr(baggage, "numberOfCores", 1) loadedJob['numberOfCores'] = numberOfCores # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob) jobInfo = { 'id': jobID, 'requestName': newJob['request_name'], 'taskName': newJob['task_name'], 'taskType': newJob['type'], 'cache_dir': newJob["cache_dir"], 'priority': newJob['wf_priority'], 'taskID': newJob['task_id'], 'retry_count': newJob["retry_count"], 'taskPriority': None, # update from the thresholds 'custom': { 'location': None }, # update later 'packageDir': batchDir, 'sandbox': loadedJob["sandbox"], # remove before submit 'userdn': loadedJob.get("ownerDN", None), 'usergroup': loadedJob.get("ownerGroup", ''), 'userrole': loadedJob.get("ownerRole", ''), 'possibleSites': frozenset( possibleLocations), # abort and drain sites filtered out 'potentialSites': frozenset(potentialLocations), # original list of sites 'scramArch': loadedJob.get("scramArch", None), 'swVersion': loadedJob.get("swVersion", None), 'name': loadedJob["name"], 'proxyPath': loadedJob.get("proxyPath", None), 'estimatedJobTime': loadedJob.get("estimatedJobTime", None), 'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None), 'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None), 'numberOfCores': loadedJob.get("numberOfCores", 1), # may update it later 'inputDataset': loadedJob.get('inputDataset', None), 'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None), 'allowOpportunistic': loadedJob.get('allowOpportunistic', False) } self.jobDataCache[jobID] = jobInfo # Register failures in submission for errorCode in badJobs: if badJobs[errorCode]: logging.debug( "The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode) self._handleSubmitFailedJobs(badJobs[errorCode], errorCode) # If there are any leftover jobs, we want to get rid of them. self.flushJobPackages() # We need to remove any jobs from the cache that were not returned in # the last call to the database. 
jobIDsToPurge = self.cachedJobIDs - dbJobs self._purgeJobsFromCache(jobIDsToPurge) logging.info("Done pruning killed jobs, moving on to submit.") return def removeAbortedForceCompletedWorkflowFromCache(self): abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData( ) jobIDsToPurge = set() for jobID, jobInfo in self.jobDataCache.iteritems(): if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \ (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]): jobIDsToPurge.add(jobID) self._purgeJobsFromCache(jobIDsToPurge) return def _purgeJobsFromCache(self, jobIDsToPurge): if len(jobIDsToPurge) == 0: return self.cachedJobIDs -= jobIDsToPurge for jobid in jobIDsToPurge: self.jobDataCache.pop(jobid, None) for jobPrio in self.cachedJobs: if self.cachedJobs[jobPrio].pop(jobid, None): # then the jobid was found, go to the next one break return def _handleSubmitFailedJobs(self, badJobs, exitCode): """ __handleSubmitFailedJobs_ For a default job report for the exitCode and register in the job. Preserve it on disk as well. Propagate the failure to the JobStateMachine. """ fwjrBinds = [] for job in badJobs: job['couch_record'] = None job['fwjr'] = Report() if exitCode in [71102, 71104]: job['fwjr'].addError( "JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations'])) elif exitCode in [71101]: # there is no possible site if job.get("fileLocations"): job['fwjr'].addError( "JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ": file locations: " + ', '.join(job['fileLocations']) + ": site white list: " + ', '.join(job['siteWhitelist']) + ": site black list: " + ', '.join(job['siteBlacklist'])) else: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]) fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count'])) job['fwjr'].setJobID(job['id']) try: job['fwjr'].save(fwjrPath) fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath}) except IOError as ioer: logging.error( "Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer)) self.changeState.propagate(badJobs, "submitfailed", "created") self.setFWJRPathAction.execute(binds=fwjrBinds) return def getThresholds(self): """ _getThresholds_ Retrieve submit thresholds, which considers what is pending and running for those sites. Also update the list of draining and abort/down sites. Finally, creates a map between task type and its priority. """ self.taskTypePrioMap = {} newDrainSites = set() newAbortSites = set() rcThresholds = self.resourceControl.listThresholdsForSubmit() for siteName in rcThresholds.keys(): # Add threshold if we don't have it already state = rcThresholds[siteName]["state"] if state == "Draining": newDrainSites.add(siteName) if state in ["Down", "Aborted"]: newAbortSites.add(siteName) # then update the task type x task priority mapping if not self.taskTypePrioMap: for task, value in rcThresholds[siteName]['thresholds'].items( ): self.taskTypePrioMap[task] = value.get( 'priority', 0) * self.maxTaskPriority # When the list of drain/abort sites change between iteration then a location # refresh is needed, for now it forces a full cache refresh if newDrainSites != self.drainSites or newAbortSites != self.abortSites: logging.info( "Draining or Aborted sites have changed, the cache will be rebuilt." 
) self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.currentRcThresholds = rcThresholds self.abortSites = newAbortSites self.drainSites = newDrainSites return def _getJobSubmitCondition(self, jobPrio, siteName, jobType): """ returns the string describing whether a job is ready to be submitted or the reason can't be submitted Only jobs with "JobSubmitReady" return value will be added to submit job. Other return values will indicate the reason jobs cannot be submitted. i.e. "NoPendingSlot" - pending slot is full with pending job """ try: totalPendingSlots = self.currentRcThresholds[siteName][ "total_pending_slots"] totalPendingJobs = self.currentRcThresholds[siteName][ "total_pending_jobs"] totalRunningSlots = self.currentRcThresholds[siteName][ "total_running_slots"] totalRunningJobs = self.currentRcThresholds[siteName][ "total_running_jobs"] taskPendingSlots = self.currentRcThresholds[siteName][ 'thresholds'][jobType]["pending_slots"] taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][ jobType]["task_pending_jobs"] taskRunningSlots = self.currentRcThresholds[siteName][ 'thresholds'][jobType]["max_slots"] taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][ jobType]["task_running_jobs"] highestPriorityInJobs = self.currentRcThresholds[siteName][ 'thresholds'][jobType]['wf_highest_priority'] # set the initial totalPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName].setdefault( "init_total_pending_jobs", totalPendingJobs) # set the initial taskPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName]['thresholds'][ jobType].setdefault("init_task_pending_jobs", taskPendingJobs) initTotalPending = self.currentRcThresholds[siteName][ "init_total_pending_jobs"] initTaskPending = self.currentRcThresholds[siteName]['thresholds'][ jobType]["init_task_pending_jobs"] except KeyError as ex: msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType) logging.exception(msg) return "NoJobType_%s_%s" % (siteName, jobType) if (highestPriorityInJobs is None) or ( jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes): # there is no pending or running jobs in the system (None case) or # priority of the job is lower or equal don't allow overflow # Also if jobType is in ioboundTypes don't allow overflow totalPendingThreshold = totalPendingSlots taskPendingThreshold = taskPendingSlots totalJobThreshold = totalPendingSlots + totalRunningSlots totalTaskTheshold = taskPendingSlots + taskRunningSlots else: # In case the priority of the job is higher than any of currently pending or running jobs. # Then increase the threshold by condorOverflowFraction * original pending slot. 
totalPendingThreshold = max( totalPendingSlots, initTotalPending) + ( totalPendingSlots * self.condorOverflowFraction) taskPendingThreshold = max(taskPendingSlots, initTaskPending) + ( taskPendingSlots * self.condorOverflowFraction) totalJobThreshold = totalPendingThreshold + totalRunningSlots totalTaskTheshold = taskPendingThreshold + taskRunningSlots jobStats = [{ "Condition": "NoPendingSlot", "Current": totalPendingJobs, "Threshold": totalPendingThreshold }, { "Condition": "NoTaskPendingSlot", "Current": taskPendingJobs, "Threshold": taskPendingThreshold }, { "Condition": "NoRunningSlot", "Current": totalPendingJobs + totalRunningJobs, "Threshold": totalJobThreshold }, { "Condition": "NoTaskRunningSlot", "Current": taskPendingJobs + taskRunningJobs, "Threshold": totalTaskTheshold }] return jobSubmitCondition(jobStats) def assignJobLocations(self): """ _assignJobLocations_ Loop through the submit thresholds and pull sites out of the job cache as we discover open slots. This will return a list of tuple where each tuple will have six elements: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory - SE name of the site to run at """ jobsToSubmit = {} jobsToUncache = [] jobsCount = 0 exitLoop = False jobSubmitLogBySites = defaultdict(Counter) jobSubmitLogByPriority = defaultdict(Counter) # iterate over jobs from the highest to the lowest prio for jobPrio in sorted(self.cachedJobs, reverse=True): # then we're completely done and have our basket full of jobs to submit if exitLoop: break # start eating through the elder jobs first for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')): jobid = job['id'] jobType = job['type'] possibleSites = job['possibleLocations'] jobSubmitLogByPriority[jobPrio]['Total'] += 1 # now look for sites with free pending slots for siteName in possibleSites: if siteName not in self.currentRcThresholds: logging.warn( "Have a job for %s which is not in the resource control", siteName) continue condition = self._getJobSubmitCondition( jobPrio, siteName, jobType) if condition != "JobSubmitReady": jobSubmitLogBySites[siteName][condition] += 1 logging.debug("Found a job for %s : %s", siteName, condition) continue # otherwise, update the site/task thresholds and the component job counter self.currentRcThresholds[siteName][ "total_pending_jobs"] += 1 self.currentRcThresholds[siteName]['thresholds'][jobType][ "task_pending_jobs"] += 1 jobsCount += 1 # load (and remove) the job dictionary object from jobDataCache cachedJob = self.jobDataCache.pop(jobid) jobsToUncache.append((jobPrio, jobid)) # Sort jobs by jobPackage package = cachedJob['packageDir'] if package not in jobsToSubmit.keys(): jobsToSubmit[package] = [] # Add the sandbox to a global list self.sandboxPackage[package] = cachedJob.pop('sandbox') # Now update the job dictionary object cachedJob['custom'] = {'location': siteName} cachedJob['taskPriority'] = self.currentRcThresholds[ siteName]['thresholds'][jobType]["priority"] # Get this job in place to be submitted by the plugin jobsToSubmit[package].append(cachedJob) jobSubmitLogBySites[siteName]["submitted"] += 1 jobSubmitLogByPriority[jobPrio]['submitted'] += 1 # found a site to submit this job, so go to the next job break # set the flag and get out of the job iteration if jobsCount >= self.maxJobsThisCycle: logging.info( "Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle) exitLoop = True break # jobs that are going to be submitted must be removed from all caches for prio, jobid 
in jobsToUncache: self.cachedJobs[prio].pop(jobid) self.cachedJobIDs.remove(jobid) logging.info("Site submission report: %s", dict(jobSubmitLogBySites)) logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority)) logging.info("Have %s packages to submit.", len(jobsToSubmit)) logging.info("Have %s jobs to submit.", jobsCount) logging.info("Done assigning site locations.") return jobsToSubmit def submitJobs(self, jobsToSubmit): """ _submitJobs_ Actually do the submission of the jobs """ jobList = [] idList = [] if len(jobsToSubmit) == 0: logging.debug("There are no packages to submit.") return for package in jobsToSubmit.keys(): sandbox = self.sandboxPackage[package] jobs = jobsToSubmit.get(package, []) for job in jobs: job['location'], job['plugin'], job[ 'site_cms_name'] = self.getSiteInfo( job['custom']['location']) job['sandbox'] = sandbox idList.append({ 'jobid': job['id'], 'location': job['custom']['location'] }) #Clean out the package reference del self.sandboxPackage[package] jobList.extend(jobs) myThread = threading.currentThread() myThread.transaction.begin() # Run the actual underlying submit code using bossAir successList, failList = self.bossAir.submit(jobs=jobList) logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList)) # Propagate states in the WMBS database logging.debug("Propagating success state to WMBS.") self.changeState.propagate(successList, 'executing', 'created') logging.debug("Propagating fail state to WMBS.") self.changeState.propagate(failList, 'submitfailed', 'created') # At the end we mark the locations of the jobs # This applies even to failed jobs, since the location # could be part of the failure reason. logging.debug("Updating job location...") self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn, transaction=True) myThread.transaction.commit() logging.info("Transaction cycle successfully completed.") return def getSiteInfo(self, jobSite): """ _getSiteInfo_ This is how you get the name of a CE and the plugin for a job """ if not jobSite in self.locationDict.keys(): siteInfo = self.locationAction.execute(siteName=jobSite) self.locationDict[jobSite] = siteInfo[0] return (self.locationDict[jobSite].get('ce_name'), self.locationDict[jobSite].get('plugin'), self.locationDict[jobSite].get('cms_name')) @timeFunction def algorithm(self, parameters=None): """ _algorithm_ Try to, in order: 1) Refresh the cache 2) Find jobs for all the necessary sites 3) Submit the jobs to the plugin """ myThread = threading.currentThread() if self.useReqMgrForCompletionCheck: # only runs when reqmgr is used (not Tier0) self.removeAbortedForceCompletedWorkflowFromCache() agentConfig = self.reqAuxDB.getWMAgentConfig( self.config.Agent.hostName) self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75) self.condorOverflowFraction = agentConfig.get( "CondorOverflowFraction", 0.2) else: # For Tier0 agent self.condorFraction = 1 self.condorOverflowFraction = 0 if not self.passSubmitConditions(): msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle." 
logging.warning(msg) myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning") return try: myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True) self.getThresholds() self.refreshCache() jobsToSubmit = self.assignJobLocations() self.submitJobs(jobsToSubmit=jobsToSubmit) except WMException: if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise except Exception as ex: msg = 'Fatal error in JobSubmitter:\n' msg += str(ex) #msg += str(traceback.format_exc()) msg += '\n\n' logging.error(msg) if getattr(myThread, 'transaction', None) != None: myThread.transaction.rollback() raise JobSubmitterPollerException(msg) return def passSubmitConditions(self): """ _passSubmitConditions_ Check whether the component is allowed to submit jobs to condor. Initially it has only one condition, which is the total number of jobs we can have in condor (pending + running) per schedd, set by MAX_JOBS_PER_OWNER. """ myThread = threading.currentThread() freeSubmitSlots = availableScheddSlots( dbi=myThread.dbi, logger=logging, condorFraction=self.condorFraction) self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll) return (self.maxJobsThisCycle > 0) def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
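The _getJobSubmitCondition method above hands the final decision to a jobSubmitCondition helper that is imported from elsewhere and not shown in this file. A minimal sketch of such a helper, assuming it simply returns the name of the first condition whose current count has already reached its threshold, and "JobSubmitReady" otherwise:

def jobSubmitCondition(jobStats):
    """
    Sketch only (assumed behaviour, not the actual WMCore implementation):
    walk the list of condition dictionaries built by _getJobSubmitCondition
    and return the first condition whose current value has reached its
    threshold; if none has, the job is ready to be submitted.
    """
    for jobStat in jobStats:
        if jobStat["Current"] >= jobStat["Threshold"]:
            return jobStat["Condition"]
    return "JobSubmitReady"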
class JobSubmitterPoller(BaseWorkerThread): """ _JobSubmitterPoller_ The jobSubmitterPoller takes the jobs and organizes them into packages before sending them to the individual plugin submitters. """ def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config # DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) # Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config, insertStates=True) self.hostName = self.config.Agent.hostName self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.maxJobsToCache = int(getattr(self.config.JobSubmitter, 'maxJobsToCache', 50000)) self.maxJobsThisCycle = self.maxJobsPerPoll # changes as per schedd limit self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) self.condorFraction = 0.75 # update during every algorithm cycle self.condorOverflowFraction = 0.2 self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting') self.drainGracePeriod = getattr(self.config.JobSubmitter, 'drainGraceTime', 2 * 24 * 60 * 60) # 2 days # Used for speed draining the agent self.enableAllSites = False # Additions for caching-based JobSubmitter self.jobsByPrio = {} # key'ed by the final job priority, which contains a set of job ids self.jobDataCache = {} # key'ed by the job id, containing the whole job info dict self.jobsToPackage = {} self.locationDict = {} self.drainSites = dict() self.drainSitesSet = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" 
msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache() self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return def getPackageCollection(self, sandboxDir): """ _getPackageCollection_ Given a jobID figure out which packageCollection it should belong in. """ rawList = os.listdir(sandboxDir) collections = [] numberList = [] for entry in rawList: if 'PackageCollection' in entry: collections.append(entry) # If we have no collections, return 0 (PackageCollection_0) if len(collections) < 1: return 0 # Loop over the list of PackageCollections for collection in collections: collectionPath = os.path.join(sandboxDir, collection) packageList = os.listdir(collectionPath) collectionNum = int(collection.split('_')[1]) if len(packageList) < self.collSize: return collectionNum else: numberList.append(collectionNum) # If we got here, then all collections are full. We'll need # a new one. Find the highest number, increment by one numberList.sort() return numberList[-1] + 1 def addJobsToPackage(self, loadedJob): """ _addJobsToPackage_ Add a job to a job package and then return the batch ID for the job. Packages are only written out to disk when they contain 100 jobs. The flushJobsPackages() method must be called after all jobs have been added to the cache and before they are actually submitted to make sure all the job packages have been written to disk. 
""" if loadedJob["workflow"] not in self.jobsToPackage: # First, let's pull all the information from the loadedJob batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"]) sandboxDir = os.path.dirname(loadedJob["sandbox"]) # Second, assemble the jobPackage location collectionIndex = self.getPackageCollection(sandboxDir) collectionDir = os.path.join(sandboxDir, 'PackageCollection_%i' % collectionIndex, 'batch_%s' % batchid) # Now create the package object self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid, 'id': loadedJob['id'], "package": JobPackage(directory=collectionDir)} jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"] jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob() batchDir = jobPackage['directory'] if len(jobPackage.keys()) == self.packageSize: if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[loadedJob["workflow"]] return batchDir def flushJobPackages(self): """ _flushJobPackages_ Write any jobs packages to disk that haven't been written out already. """ workflowNames = self.jobsToPackage.keys() for workflowName in workflowNames: jobPackage = self.jobsToPackage[workflowName]["package"] batchDir = jobPackage['directory'] if not os.path.exists(batchDir): os.makedirs(batchDir) batchPath = os.path.join(batchDir, "JobPackage.pkl") jobPackage.save(batchPath) del self.jobsToPackage[workflowName] return def hasToRefreshCache(self): """ _hasToRefreshCache_ Check whether we should update the job data cache (or update it with new jobs in the created state) or if we just skip it. """ if self.cacheRefreshSize == -1 or len(self.jobDataCache) < self.cacheRefreshSize or\ self.refreshPollingCount >= self.skipRefreshCount: self.refreshPollingCount = 0 return True else: self.refreshPollingCount += 1 logging.info("Skipping cache update to be submitted. (%s job in cache)", len(self.jobDataCache)) return False def refreshCache(self): """ _refreshCache_ Query WMBS for all jobs in the 'created' state. For all jobs returned from the query, check if they already exist in the cache. If they don't, unpickle them and combine their site white and black list with the list of locations they can run at. Add them to the cache. Each entry in the cache is a tuple with five items: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory """ # make a counter for jobs pending to sites in drain mode within the grace period countDrainingJobs = 0 timeNow = int(time.time()) badJobs = dict([(x, []) for x in range(71101, 71105)]) newJobIds = set() logging.info("Refreshing priority cache with currently %i jobs", len(self.jobDataCache)) newJobs = self.listJobsAction.execute(limitRows=self.maxJobsToCache) if self.useReqMgrForCompletionCheck: # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() else: abortedAndForceCompleteRequests = [] logging.info("Found %s new jobs to be submitted.", len(newJobs)) if self.enableAllSites: logging.info("Agent is in speed drain mode. Submitting jobs to all possible locations.") logging.info("Determining possible sites for new jobs...") jobCount = 0 for newJob in newJobs: jobCount += 1 if jobCount % 5000 == 0: logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs)) # whether newJob belongs to aborted or force-complete workflow, and skip it if it is. 
if newJob['request_name'] in abortedAndForceCompleteRequests and \ newJob['task_type'] not in ['LogCollect', "Cleanup"]: continue jobID = newJob['id'] newJobIds.add(jobID) if jobID in self.jobDataCache: continue pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl") if not os.path.isfile(pickledJobPath): # Then we have a problem - there's no file logging.error("Could not find pickled jobObject %s", pickledJobPath) badJobs[71103].append(newJob) continue try: with open(pickledJobPath, 'r') as jobHandle: loadedJob = pickle.load(jobHandle) except Exception as ex: msg = "Error while loading pickled job object %s\n" % pickledJobPath msg += str(ex) logging.error(msg) raise JobSubmitterPollerException(msg) # figure out possible locations for job possibleLocations = loadedJob["possiblePSN"] # Create another set of locations that may change when a site goes white/black listed # Does not care about the non_draining or aborted sites, they may change and that is the point potentialLocations = set() potentialLocations.update(possibleLocations) # check if there is at least one site left to run the job if len(possibleLocations) == 0: newJob['fileLocations'] = loadedJob.get('fileLocations', []) newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', []) newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', []) logging.warning("Input data location doesn't pass the site restrictions for job id: %s", jobID) badJobs[71101].append(newJob) continue # if agent is in speed drain and has hit the threshold to submit to all sites, we can skip the logic below that exclude sites if not self.enableAllSites: # check for sites in aborted state and adjust the possible locations nonAbortSites = [x for x in possibleLocations if x not in self.abortSites] if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job possibleLocations = nonAbortSites else: newJob['possibleSites'] = possibleLocations logging.warning("Job id %s can only run at a site in Aborted state", jobID) badJobs[71102].append(newJob) continue # try to remove draining sites if possible, this is needed to stop # jobs that could run anywhere blocking draining sites # if the job type is Merge, LogCollect or Cleanup this is skipped if newJob['task_type'] not in self.ioboundTypes: nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites] if nonDrainingSites: # if >1 viable non-draining site remove draining ones possibleLocations = nonDrainingSites elif self.failJobDrain(timeNow, possibleLocations): newJob['possibleSites'] = possibleLocations logging.warning("Job id %s can only run at a sites in Draining state", jobID) badJobs[71104].append(newJob) continue else: countDrainingJobs += 1 continue # Sigh...make sure the job added to the package has the proper retry_count loadedJob['retry_count'] = newJob['retry_count'] batchDir = self.addJobsToPackage(loadedJob) # calculate the final job priority such that we can order cached jobs by prio jobPrio = newJob['task_prio'] * self.maxTaskPriority + newJob['wf_priority'] self.jobsByPrio.setdefault(jobPrio, set()) self.jobsByPrio[jobPrio].add(jobID) # allow job baggage to override numberOfCores # => used for repacking to get more slots/disk numberOfCores = loadedJob.get('numberOfCores', 1) if numberOfCores == 1: baggage = loadedJob.getBaggage() numberOfCores = getattr(baggage, "numberOfCores", 1) loadedJob['numberOfCores'] = numberOfCores # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob) jobInfo = {'taskPriority': 
newJob['task_prio'], 'custom': {'location': None}, # update later 'packageDir': batchDir, 'retry_count': newJob["retry_count"], 'sandbox': loadedJob["sandbox"], # remove before submit 'userdn': loadedJob.get("ownerDN", None), 'usergroup': loadedJob.get("ownerGroup", ''), 'userrole': loadedJob.get("ownerRole", ''), 'possibleSites': frozenset(possibleLocations), # abort and drain sites filtered out 'potentialSites': frozenset(potentialLocations), # original list of sites 'scramArch': loadedJob.get("scramArch", None), 'swVersion': loadedJob.get("swVersion", []), 'proxyPath': loadedJob.get("proxyPath", None), 'estimatedJobTime': loadedJob.get("estimatedJobTime", None), 'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None), 'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None), 'numberOfCores': loadedJob.get("numberOfCores"), # may update it later 'inputDataset': loadedJob.get('inputDataset', None), 'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None), 'inputPileup': loadedJob.get('inputPileup', None), 'allowOpportunistic': loadedJob.get('allowOpportunistic', False)} # then update it with the info retrieved from the database jobInfo.update(newJob) self.jobDataCache[jobID] = jobInfo # Register failures in submission for errorCode in badJobs: if badJobs[errorCode]: logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode) self._handleSubmitFailedJobs(badJobs[errorCode], errorCode) # Persist remaining job packages to disk self.flushJobPackages() # We need to remove any jobs from the cache that were not returned in # the last call to the database. jobIDsToPurge = set(self.jobDataCache.keys()) - newJobIds self._purgeJobsFromCache(jobIDsToPurge) logging.info("Found %d jobs pending to sites in drain within the grace period", countDrainingJobs) logging.info("Done pruning killed jobs, moving on to submit.") return def failJobDrain(self, timeNow, possibleLocations): """ Check whether sites are in drain for too long such that the job has to be marked as failed or not. :param timeNow: timestamp for this cycle :param possibleLocations: list of possible locations where the job can run :return: a boolean saying whether the job has to fail or not """ fail = True for siteName in set(possibleLocations).intersection(self.drainSitesSet): if timeNow - self.drainSites[siteName] < self.drainGracePeriod: # then let this job be, it's a fresh draining site fail = False break return fail def removeAbortedForceCompletedWorkflowFromCache(self): abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData() jobIDsToPurge = set() for jobID, jobInfo in self.jobDataCache.iteritems(): if (jobInfo['request_name'] in abortedAndForceCompleteRequests) and \ (jobInfo['task_type'] not in ['LogCollect', "Cleanup"]): jobIDsToPurge.add(jobID) self._purgeJobsFromCache(jobIDsToPurge) return def _purgeJobsFromCache(self, jobIDsToPurge): if len(jobIDsToPurge) == 0: return for jobid in jobIDsToPurge: self.jobDataCache.pop(jobid, None) for jobPrio in self.jobsByPrio: if jobid in self.jobsByPrio[jobPrio]: # then the jobid was found, go to the next one self.jobsByPrio[jobPrio].discard(jobid) break return def _handleSubmitFailedJobs(self, badJobs, exitCode): """ __handleSubmitFailedJobs_ Form a default job report for the exitCode and register it in the job. Preserve it on disk as well. Propagate the failure to the JobStateMachine.
""" fwjrBinds = [] for job in badJobs: job['couch_record'] = None job['fwjr'] = Report() if exitCode in [71102, 71104]: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleSites']), ', '.join(job['possibleSites'])) elif exitCode in [71101]: # there is no possible site if job.get("fileLocations"): job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ": file locations: " + ', '.join(job['fileLocations']) + ": site white list: " + ', '.join(job['siteWhitelist']) + ": site black list: " + ', '.join(job['siteBlacklist'])) else: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', and empty fileLocations') else: job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]) fwjrPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count'])) job['fwjr'].setJobID(job['id']) try: job['fwjr'].save(fwjrPath) fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath}) except IOError as ioer: logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer)) self.changeState.propagate(badJobs, "submitfailed", "created") self.setFWJRPathAction.execute(binds=fwjrBinds) return def getThresholds(self): """ _getThresholds_ Retrieve submit thresholds, which considers what is pending and running for those sites. Also update the list of draining and abort/down sites. Finally, creates a map between task type and its priority. """ # lets store also a timestamp for when a site joined the Drain state newDrainSites = dict() newAbortSites = set() rcThresholds = self.resourceControl.listThresholdsForSubmit() for siteName in rcThresholds.keys(): # Add threshold if we don't have it already state = rcThresholds[siteName]["state"] if state == "Draining": newDrainSites.update({siteName: rcThresholds[siteName]["state_time"]}) if state in ["Down", "Aborted"]: newAbortSites.add(siteName) # When the list of drain/abort sites change between iteration then a location # refresh is needed, for now it forces a full cache refresh if set(newDrainSites.keys()) != self.drainSitesSet or newAbortSites != self.abortSites: logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.") self.jobsByPrio = {} self.jobDataCache = {} self.currentRcThresholds = rcThresholds self.abortSites = newAbortSites self.drainSites = newDrainSites self.drainSitesSet = set(newDrainSites.keys()) return def checkZeroTaskThresholds(self, jobType, siteList): """ _checkZeroTaskThresholds_ Given a job type and a list of sites, remove sites from the list if that site + task has 0 pending thresholds. Returns a new list of sites """ newSiteList = [] for site in siteList: try: taskPendingSlots = self.currentRcThresholds[site]['thresholds'][jobType]["pending_slots"] except KeyError as ex: msg = "Invalid key for site %s and job type %s. Error: %s" % (site, jobType, str(ex)) logging.warning(msg) else: if taskPendingSlots > 0: newSiteList.append(site) return newSiteList def _getJobSubmitCondition(self, jobPrio, siteName, jobType): """ returns the string describing whether a job is ready to be submitted or the reason can't be submitted Only jobs with "JobSubmitReady" return value will be added to submit job. Other return values will indicate the reason jobs cannot be submitted. i.e. 
"NoPendingSlot" - pending slot is full with pending job """ try: totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"] totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"] totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"] totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"] taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"] taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"] taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"] highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority'] # set the initial totalPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs) # set the initial taskPendingJobs since it increases in every cycle when a job is submitted self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs", taskPendingJobs) initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"] initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"] except KeyError as ex: msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType) msg += str(ex) logging.exception(msg) return "NoJobType_%s_%s" % (siteName, jobType) if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes): # there is no pending or running jobs in the system (None case) or # priority of the job is lower or equal don't allow overflow # Also if jobType is in ioboundTypes don't allow overflow totalPendingThreshold = totalPendingSlots taskPendingThreshold = taskPendingSlots totalJobThreshold = totalPendingSlots + totalRunningSlots totalTaskTheshold = taskPendingSlots + taskRunningSlots else: # In case the priority of the job is higher than any of currently pending or running jobs. # Then increase the threshold by condorOverflowFraction * original pending slot. totalPendingThreshold = max(totalPendingSlots, initTotalPending) + ( totalPendingSlots * self.condorOverflowFraction) taskPendingThreshold = max(taskPendingSlots, initTaskPending) + ( taskPendingSlots * self.condorOverflowFraction) totalJobThreshold = totalPendingThreshold + totalRunningSlots totalTaskTheshold = taskPendingThreshold + taskRunningSlots jobStats = [{"Condition": "NoPendingSlot", "Current": totalPendingJobs, "Threshold": totalPendingThreshold}, {"Condition": "NoTaskPendingSlot", "Current": taskPendingJobs, "Threshold": taskPendingThreshold}, {"Condition": "NoRunningSlot", "Current": totalPendingJobs + totalRunningJobs, "Threshold": totalJobThreshold}, {"Condition": "NoTaskRunningSlot", "Current": taskPendingJobs + taskRunningJobs, "Threshold": totalTaskTheshold}] return jobSubmitCondition(jobStats) def assignJobLocations(self): """ _assignJobLocations_ Loop through the submit thresholds and pull sites out of the job cache as we discover open slots. 
This will return a list of tuple where each tuple will have six elements: - WMBS Job ID - Retry count - Batch ID - Path to sanbox - Path to cache directory - SE name of the site to run at """ jobsToSubmit = {} jobsCount = 0 exitLoop = False jobSubmitLogBySites = defaultdict(lambda: defaultdict(Counter)) jobSubmitLogByPriority = defaultdict(lambda: defaultdict(Counter)) # iterate over jobs from the highest to the lowest prio for jobPrio in sorted(self.jobsByPrio, reverse=True): # then we're completely done and have our basket full of jobs to submit if exitLoop: break # can we assume jobid=1 is older than jobid=3? I think so... for jobid in sorted(self.jobsByPrio[jobPrio]): jobType = self.jobDataCache[jobid]['task_type'] possibleSites = self.jobDataCache[jobid]['possibleSites'] # remove sites with 0 task thresholds possibleSites = self.checkZeroTaskThresholds(jobType, possibleSites) jobSubmitLogByPriority[jobPrio][jobType]['Total'] += 1 # now look for sites with free pending slots for siteName in possibleSites: condition = self._getJobSubmitCondition(jobPrio, siteName, jobType) if condition != "JobSubmitReady": jobSubmitLogBySites[siteName][jobType][condition] += 1 logging.debug("Found a job for %s : %s", siteName, condition) continue # pop the job dictionary object and update it cachedJob = self.jobDataCache.pop(jobid) cachedJob['custom'] = {'location': siteName} cachedJob['possibleSites'] = possibleSites # Sort jobs by jobPackage and get it in place to be submitted by the plugin package = cachedJob['packageDir'] jobsToSubmit.setdefault(package, []) jobsToSubmit[package].append(cachedJob) # update site/task thresholds and the component job counter self.currentRcThresholds[siteName]["total_pending_jobs"] += 1 self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1 jobsCount += 1 jobSubmitLogBySites[siteName][jobType]["submitted"] += 1 jobSubmitLogByPriority[jobPrio][jobType]['submitted'] += 1 # jobs that will be submitted must leave the job data cache self.jobsByPrio[jobPrio].discard(jobid) # found a site to submit this job, so go to the next job break # set the flag and get out of the job iteration if jobsCount >= self.maxJobsThisCycle: logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle) exitLoop = True break logging.info("Site submission report ...") for site in jobSubmitLogBySites: logging.info(" %s : %s", site, json.dumps(jobSubmitLogBySites[site])) logging.info("Priority submission report ...") for prio in jobSubmitLogByPriority: logging.info(" %s : %s", prio, json.dumps(jobSubmitLogByPriority[prio])) logging.info("Have %s packages to submit.", len(jobsToSubmit)) logging.info("Have %s jobs to submit.", jobsCount) logging.info("Done assigning site locations.") return jobsToSubmit def submitJobs(self, jobsToSubmit): """ _submitJobs_ Actually do the submission of the jobs """ jobList = [] idList = [] if len(jobsToSubmit) == 0: logging.debug("There are no packages to submit.") return for package in jobsToSubmit.keys(): jobs = jobsToSubmit.get(package, []) for job in jobs: job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location']) idList.append({'jobid': job['id'], 'location': job['custom']['location']}) jobList.extend(jobs) myThread = threading.currentThread() myThread.transaction.begin() # Run the actual underlying submit code using bossAir successList, failList = self.bossAir.submit(jobs=jobList) logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), 
len(failList)) # Propagate states in the WMBS database logging.debug("Propagating success state to WMBS.") self.changeState.propagate(successList, 'executing', 'created') logging.debug("Propagating fail state to WMBS.") self.changeState.propagate(failList, 'submitfailed', 'created') # At the end we mark the locations of the jobs # This applies even to failed jobs, since the location # could be part of the failure reason. logging.debug("Updating job location...") self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn, transaction=True) myThread.transaction.commit() logging.info("Transaction cycle successfully completed.") return def getSiteInfo(self, jobSite): """ _getSiteInfo_ This is how you get the name of a CE and the plugin for a job """ if jobSite not in self.locationDict.keys(): siteInfo = self.locationAction.execute(siteName=jobSite) self.locationDict[jobSite] = siteInfo[0] return (self.locationDict[jobSite].get('ce_name'), self.locationDict[jobSite].get('plugin'), self.locationDict[jobSite].get('cms_name')) @timeFunction def algorithm(self, parameters=None): """ _algorithm_ Try to, in order: 1) Refresh the cache 2) Find jobs for all the necessary sites 3) Submit the jobs to the plugin """ myThread = threading.currentThread() if self.useReqMgrForCompletionCheck: # only runs when reqmgr is used (not Tier0) self.removeAbortedForceCompletedWorkflowFromCache() agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName) if agentConfig.get("UserDrainMode") and agentConfig.get("SpeedDrainMode"): self.enableAllSites = agentConfig.get("SpeedDrainConfig")['EnableAllSites']['Enabled'] else: self.enableAllSites = False self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75) self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2) else: # For Tier0 agent self.condorFraction = 1 self.condorOverflowFraction = 0 if not self.passSubmitConditions(): msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle." logging.warning(msg) myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning") return try: myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True) self.getThresholds() if self.hasToRefreshCache(): self.refreshCache() jobsToSubmit = self.assignJobLocations() self.submitJobs(jobsToSubmit=jobsToSubmit) except WMException: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() raise except Exception as ex: msg = 'Fatal error in JobSubmitter:\n' msg += str(ex) # msg += str(traceback.format_exc()) msg += '\n\n' logging.error(msg) if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() raise JobSubmitterPollerException(msg) return def passSubmitConditions(self): """ _passSubmitConditions_ Check whether the component is allowed to submit jobs to condor. Initially it has only one condition, which is the total number of jobs we can have in condor (pending + running) per schedd, set by MAX_JOBS_PER_OWNER. """ myThread = threading.currentThread() freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging, condorFraction=self.condorFraction) self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll) return (self.maxJobsThisCycle > 0) def terminate(self, params): """ _terminate_ Kill the code after one final pass when called by the master thread. """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params)
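refreshCache() above orders cached jobs by a single integer priority that folds the task priority and the workflow priority together. A small worked example with hypothetical numbers; only the maxTaskPriority default of 1e7 and the formula itself come from the code above:

# Illustrative numbers only; the formula is the one used in refreshCache().
maxTaskPriority = int(1e7)   # default from config.BossAir.maxTaskPriority

taskPrio = 3                 # hypothetical task priority (newJob['task_prio'])
wfPriority = 85000           # hypothetical workflow priority (newJob['wf_priority'])

jobPrio = taskPrio * maxTaskPriority + wfPriority
# jobPrio == 30085000: the task priority dominates the ordering, while the
# workflow priority only breaks ties between jobs of the same task priority.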
def testA_APITest(self): """ _APITest_ This is a commissioning test that has very little to do with anything except loading the code. """ #return myThread = threading.currentThread() config = self.getConfig() baAPI = BossAirAPI(config=config) # We should have loaded a plugin self.assertTrue('TestPlugin' in baAPI.plugins.keys()) result = myThread.dbi.processData( "SELECT name FROM bl_status")[0].fetchall() statusList = [] for i in result: statusList.append(i.values()[0]) # We should have the plugin states in the database self.assertEqual(sorted(statusList), sorted(['New', 'Dead', 'Gone'])) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs=nJobs) print(jobDummies) baAPI.createNewJobs(wmbsJobs=jobDummies) runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), nJobs) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nJobs) deadJobs = baAPI._loadByStatus(status='Dead') self.assertEqual(len(deadJobs), 0) self.assertRaises(BossAirException, baAPI._loadByStatus, status='FalseStatus') # Change the job status and update it for job in newJobs: job['status'] = 'Dead' baAPI._updateJobs(jobs=newJobs) # Test whether we see the job status as updated newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), 0) deadJobs = baAPI._loadByStatus(status='Dead') self.assertEqual(len(deadJobs), nJobs) # Can we load by BossAir ID? loadedJobs = baAPI._loadByID(jobs=deadJobs) self.assertEqual(len(loadedJobs), nJobs) # Can we load via WMBS? loadedJobs = baAPI.loadByWMBS(wmbsJobs=jobDummies) self.assertEqual(len(loadedJobs), nJobs) # See if we can delete jobs baAPI._deleteJobs(jobs=deadJobs) # Confirm that they're gone deadJobs = baAPI._loadByStatus(status='Dead') self.assertEqual(len(deadJobs), 0) self.assertEqual(len(baAPI.jobs), 0) return
def testH_ARCTest(self): """ _ARCTest_ This test works on the ARCPlugin, checking all of its functions with a single set of jobs """ nRunning = getNArcJobs() self.assertEqual(nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginNames.append("ARCPlugin") #config.BossAir.pluginNames = ["ARCPlugin"] baAPI = BossAirAPI(config = config) nJobs = 2 jobDummies = self.createDummyJobs(nJobs = nJobs, location = 'jade-cms.hip.fi') #baAPI.createNewJobs(wmbsJobs = jobDummies) #changeState = ChangeState(config) #changeState.propagate(jobDummies, 'created', 'new') #changeState.propagate(jobDummies, 'executing', 'created') jobPackage = os.path.join(self.testDir, 'JobPackage.pkl') f = open(jobPackage, 'w') f.write(' ') f.close() sandbox = os.path.join(self.testDir, 'sandbox.box') f = open(sandbox, 'w') f.write(' ') f.close() jobList = [] for j in jobDummies: job = j # {'id': j['id']} job['custom'] = {'location': 'jade-cms.hip.fi'} job['location'] = 'jade-cms.hip.fi' job['plugin'] = 'ARCPlugin' job['name'] = j['name'] job['cache_dir'] = self.testDir job['retry_count'] = 0 job['owner'] = 'edelmann' job['packageDir'] = self.testDir job['sandbox'] = sandbox job['priority'] = None jobList.append(job) baAPI.submit(jobs = jobList) nRunning = getNArcJobs() self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) baAPI.track() newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) rJobs = baAPI._listRunJobs() nOldJobs = 0 for j in rJobs: if j['status'] != "New": nOldJobs += 1 self.assertEqual(nOldJobs, nJobs) #if baAPI.plugins['ARCPlugin'].stateDict[j['status']] in [ "Pending", "Running" ]: baAPI.kill(jobs = jobList) nRunning = getNArcJobs() self.assertEqual(nRunning, 0) # Try resubmission for j in jobList: j['retry_count'] = 1 succ, fail = baAPI.submit(jobs = jobList) time.sleep(30) nRunning = getNArcJobs() self.assertEqual(nRunning, nJobs) newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), nJobs) # See where they are baAPI.track() newJobs = baAPI._loadByStatus(status = 'New') self.assertEqual(len(newJobs), 0) rJobs = baAPI._listRunJobs() nOldJobs = 0 idStr = "" for j in rJobs: idStr += " " + j['gridid'] if j['status'] != "New": nOldJobs += 1 self.assertEqual(nOldJobs, nJobs) # Now kill 'em manually no_jobs = True while no_jobs: command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngkill -t 180 -a' pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True) output = pipe.communicate()[0] if output.find("Job information not found") >= 0: # It seems the jobs hasn't reached the ARC info.sys yet. # Sleep a while and try again time.sleep(20) continue else: no_jobs = False # Just to be sure, if the jobs were already finished, do a # 'ngclean' too. command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngclean -t 180 -a' pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True) output = pipe.communicate()[0] # Make sure the killing of the jobs reaches the info.sys. still_jobs = True while still_jobs: command = 'LD_LIBRARY_PATH=$CLEAN_LD_LIBRARY_PATH ngstat -t 180 ' + idStr pipe = Popen(command, stdout = PIPE, stderr = STDOUT, shell = True) output = pipe.communicate()[0] if output.find("Job information not found") < 0: # It seems the killing of the jobs hasn't reached the ARC info.sys yet. 
# Sleep a while and try again time.sleep(20) continue else: still_jobs = False # See what happened baAPI.track() idJobs = baAPI._loadByID(rJobs) nActiveJobs = 0 nRemovedJobs = 0 for j in idJobs: if j['status'] not in [ "New", "KILLING", "KILLED", "LOST" ]: nActiveJobs += 1 if j['status'] in [ "KILLING", "KILLED", "LOST" ]: nRemovedJobs += 1 self.assertEqual(nActiveJobs, 0) self.assertEqual(nRemovedJobs, nJobs) return
def __init__(self, config): BaseWorkerThread.__init__(self) myThread = threading.currentThread() self.config = config #DAO factory for WMBS objects self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi) #Libraries self.resourceControl = ResourceControl() self.changeState = ChangeState(self.config) self.bossAir = BossAirAPI(config=self.config) self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000) self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000)) self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000)) self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20)) self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500) self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000) self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7) # Additions for caching-based JobSubmitter self.cachedJobIDs = set() self.cachedJobs = {} self.jobDataCache = {} self.jobsToPackage = {} self.sandboxPackage = {} self.locationDict = {} self.taskTypePrioMap = {} self.drainSites = set() self.abortSites = set() self.refreshPollingCount = 0 try: if not getattr(self.config.JobSubmitter, 'submitDir', None): self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages') if not os.path.exists(self.packageDir): os.makedirs(self.packageDir) except OSError as ex: msg = "Error while trying to create packageDir %s\n!" msg += str(ex) logging.error(msg) logging.debug("PackageDir: %s", self.packageDir) logging.debug("Config: %s", config) raise JobSubmitterPollerException(msg) # Now the DAOs self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter") self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation") self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo") self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath") self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter") # Keep a record of the thresholds in memory self.currentRcThresholds = {} self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True) if self.useReqMgrForCompletionCheck: # only set up this when reqmgr is used (not Tier0) self.reqmgr2Svc = ReqMgr(self.config.TaskArchiver.ReqMgr2ServiceURL) self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache() else: # Tier0 Case - just for the clarity (This private variable shouldn't be used self.abortedAndForceCompleteWorkflowCache = None return
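The constructor above pulls its tunables from a handful of configuration sections. Below is a minimal sketch of those sections built with WMCore's Configuration object; the numeric values shown are the defaults used by the getattr() calls above, the paths are hypothetical, and a real agent configuration would also need the database/CouchDB sections required by ResourceControl, ChangeState and BossAirAPI, which are not shown here.

from WMCore.Configuration import Configuration

config = Configuration()

config.component_("JobSubmitter")
config.JobSubmitter.componentDir = "/data/srv/wmagent/JobSubmitter"  # hypothetical path
config.JobSubmitter.maxJobsPerPoll = 1000
config.JobSubmitter.cacheRefreshSize = 30000
config.JobSubmitter.skipRefreshCount = 20
config.JobSubmitter.packageSize = 500

config.component_("BossAir")
config.BossAir.maxTaskPriority = 1e7

config.component_("TaskArchiver")
config.TaskArchiver.useReqMgrForCompletionCheck = False  # Tier0-like setup; skips the ReqMgr2 wiring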
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None): """ _killWorkflow_ Kill a workflow that is already executing inside the agent. This will mark all incomplete jobs as failed and files that belong to all non-cleanup and non-logcollect subscriptions as failed. The name of the JSM couch database and the URL to the database must be passed in as well so the state transitions are logged. """ myThread = threading.currentThread() daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow") killJobsAction = daoFactory(classname="Jobs.KillWorkflow") killFilesAction.execute(workflowName=workflowName, conn=myThread.transaction.conn) liveJobs = killJobsAction.execute(workflowName=workflowName, conn=myThread.transaction.conn) changeState = ChangeState(jobCouchConfig) # Deal with any jobs that are running in the batch system # only works if we can start the API if bossAirConfig: bossAir = BossAirAPI(config=bossAirConfig, noSetup=True) killableJobs = [] for liveJob in liveJobs: if liveJob["state"].lower() == 'executing': # Then we need to kill this on the batch system liveWMBSJob = Job(id=liveJob["id"]) liveWMBSJob.update(liveJob) killableJobs.append(liveJob) # Now kill them try: logging.info("Killing %d jobs for workflow: %s", len(killableJobs), workflowName) bossAir.kill(jobs=killableJobs, workflowName=workflowName) except BossAirException as ex: # Something's gone wrong. Jobs not killed! logging.error( "Error while trying to kill running jobs in workflow!\n") logging.error(str(ex)) trace = getattr(ex, 'traceback', '') logging.error(trace) # But continue; we need to kill the jobs in the master # the batch system will have to take care of itself. liveWMBSJobs = defaultdict(list) for liveJob in liveJobs: if liveJob["state"] == "killed": # Then we've killed it already continue liveWMBSJob = Job(id=liveJob["id"]) liveWMBSJob.update(liveJob) liveWMBSJobs[liveJob["state"]].append(liveWMBSJob) for state, jobsByState in liveWMBSJobs.items(): if len(jobsByState) > 100 and state != "executing": # if there are to many jobs skip the couch and dashboard update # TODO: couch and dashboard need to be updated or parallel. changeState.check("killed", state) changeState.persist(jobsByState, "killed", state) else: changeState.propagate(jobsByState, "killed", state) return
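For completeness, a hedged example of how killWorkflow() might be invoked from a standalone script. The configuration file path and the workflow name are hypothetical, and it is assumed that the same agent configuration object is acceptable both as the JSM couch configuration and as the BossAir configuration:

from WMCore.Configuration import loadConfigurationFile

# hypothetical path to the agent configuration file
agentConfig = loadConfigurationFile("/data/srv/wmagent/current/config/config.py")

killWorkflow(workflowName="pdmvserv_task_EXO-Example-00001",  # hypothetical workflow name
             jobCouchConfig=agentConfig,
             bossAirConfig=agentConfig)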
def testB_PluginTest(self): """ _PluginTest_ Now check that these functions worked if called through plugins Instead of directly. There are only three plugin """ #return myThread = threading.currentThread() config = self.getConfig() baAPI = BossAirAPI(config=config) # Create some jobs nJobs = 10 jobDummies = self.createDummyJobs(nJobs=nJobs, location='Xanadu') changeState = ChangeState(config) changeState.propagate(jobDummies, 'created', 'new') changeState.propagate(jobDummies, 'executing', 'created') # Prior to building the job, each job must have a plugin # and user assigned for job in jobDummies: job['plugin'] = 'TestPlugin' job['owner'] = 'tapas' baAPI.submit(jobs=jobDummies) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nJobs) # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), nJobs) # Test Plugin should complete all jobs baAPI.track() # Should be no more running jobs runningJobs = baAPI._listRunJobs() self.assertEqual(len(runningJobs), 0) # Check if they're complete completeJobs = baAPI.getComplete() self.assertEqual(len(completeJobs), nJobs) # Do this test because BossAir is specifically built # to keep it from finding completed jobs result = myThread.dbi.processData( "SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), nJobs) baAPI.removeComplete(jobs=jobDummies) result = myThread.dbi.processData( "SELECT id FROM bl_runjob")[0].fetchall() self.assertEqual(len(result), 0) return
def testD_PrototypeChain(self): """ _PrototypeChain_ Prototype the BossAir workflow """ dummymyThread = threading.currentThread() nRunning = getCondorRunningJobs(self.user) self.assertEqual( nRunning, 0, "User currently has %i running jobs. Test will not continue" % (nRunning)) config = self.getConfig() config.BossAir.pluginName = 'SimpleCondorPlugin' baAPI = BossAirAPI(config=config, insertStates=True) workload = self.createTestWorkload() workloadName = "basicWorkload" changeState = ChangeState(config) nSubs = 5 nJobs = 10 dummycacheDir = os.path.join(self.testDir, 'CacheDir') jobGroupList = self.createJobGroups(nSubs=nSubs, nJobs=nJobs, task=workload.getTask("ReReco"), workloadSpec=os.path.join( self.testDir, 'workloadTest', workloadName), site='se.T2_US_UCSD') for group in jobGroupList: changeState.propagate(group.jobs, 'created', 'new') jobSubmitter = JobSubmitterPoller(config=config) jobTracker = JobTrackerPoller(config=config) statusPoller = StatusPoller(config=config) jobSubmitter.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), nSubs * nJobs) # Check WMBS getJobsAction = self.daoFactory(classname="Jobs.GetAllJobs") result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) statusPoller.algorithm() nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, nSubs * nJobs) newJobs = baAPI._loadByStatus(status='New') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), nSubs * nJobs) # Tracker should do nothing jobTracker.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) # Wait for jobs to timeout due to short Pending wait period time.sleep(12) statusPoller.algorithm() newJobs = baAPI._loadByStatus(status='Idle') self.assertEqual(len(newJobs), 0) newJobs = baAPI._loadByStatus(status='Timeout', complete='0') self.assertEqual(len(newJobs), nSubs * nJobs) # Jobs should be gone nRunning = getCondorRunningJobs(self.user) self.assertEqual(nRunning, 0) # Check if they're complete completeJobs = baAPI.getComplete() self.assertEqual(len(completeJobs), nSubs * nJobs) # Because they timed out, they all should have failed jobTracker.algorithm() result = getJobsAction.execute(state='Executing', jobType="Processing") self.assertEqual(len(result), 0) result = getJobsAction.execute(state='JobFailed', jobType="Processing") self.assertEqual(len(result), nSubs * nJobs) return