def testJobKilling(self):
    """
    _testJobKilling_

    Test that we can successfully set jobs to the killed state.
    """
    change = ChangeState(self.config, "changestate_t")

    locationAction = self.daoFactory(classname="Locations.New")
    locationAction.execute("site1", pnn="T2_CH_CERN")

    testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                            name="wf001", task=self.taskName)
    testWorkflow.create()

    testFileset = Fileset(name="TestFileset")
    testFileset.create()

    for i in range(4):
        newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
        newFile.create()
        testFileset.addFile(newFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased")
    testSubscription.create()

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.WMBS",
                          subscription=testSubscription)
    jobGroup = jobFactory(files_per_job=1)[0]

    assert len(jobGroup.jobs) == 4, \
        "Error: Splitting should have created four jobs."

    # All four jobs carry the same user/group/task metadata.
    for testJob in jobGroup.jobs:
        testJob["user"] = "******"
        testJob["group"] = "DMWM"
        testJob["taskType"] = "Processing"
    testJobA, testJobB, testJobC, testJobD = jobGroup.jobs

    # Move the jobs into a spread of states, then kill them from each one.
    change.persist([testJobA], "created", "new")
    change.persist([testJobB], "jobfailed", "executing")
    change.persist([testJobC, testJobD], "executing", "created")

    change.persist([testJobA], "killed", "created")
    change.persist([testJobB], "killed", "jobfailed")
    change.persist([testJobC, testJobD], "killed", "executing")

    for job in [testJobA, testJobB, testJobC, testJobD]:
        job.load()
        self.assertEqual(job['retry_count'], 99999)
        self.assertEqual(job['state'], 'killed')

    return
def testRetryCount(self):
    """
    _testRetryCount_

    Verify that the retry count is incremented when we move out of the
    submitcooloff or jobcooloff state.
    """
    change = ChangeState(self.config, "changestate_t")

    locationAction = self.daoFactory(classname="Locations.New")
    locationAction.execute("site1", pnn="T2_CH_CERN")

    testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                            name="wf001", task=self.taskName)
    testWorkflow.create()

    testFileset = Fileset(name="TestFileset")
    testFileset.create()

    for i in range(4):
        newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
        newFile.create()
        testFileset.addFile(newFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased")
    testSubscription.create()

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.WMBS",
                          subscription=testSubscription)
    jobGroup = jobFactory(files_per_job=1)[0]

    assert len(jobGroup.jobs) == 4, \
        "Error: Splitting should have created four jobs."

    for testJob in jobGroup.jobs:
        testJob["user"] = "******"
        testJob["group"] = "DMWM"
        testJob["taskType"] = "Processing"
    testJobA, testJobB, testJobC, testJobD = jobGroup.jobs

    # Only the transitions out of the cooloff states should bump the
    # retry count.
    change.persist([testJobA], "created", "submitcooloff")
    change.persist([testJobB], "created", "jobcooloff")
    change.persist([testJobC, testJobD], "new", "none")

    for testJob in [testJobA, testJobB, testJobC, testJobD]:
        testJob.load()

    assert testJobA["retry_count"] == 1, \
        "Error: Retry count is wrong."
    assert testJobB["retry_count"] == 1, \
        "Error: Retry count is wrong."
    assert testJobC["retry_count"] == 0, \
        "Error: Retry count is wrong."
    assert testJobD["retry_count"] == 0, \
        "Error: Retry count is wrong."

    return
def testPersist(self):
    """
    _testPersist_

    Verify that ChangeState.persist() moves jobs into their new state
    in WMBS.
    """
    change = ChangeState(self.config, "changestate_t")

    locationAction = self.daoFactory(classname="Locations.New")
    locationAction.execute("site1", pnn="T2_CH_CERN")

    testWorkflow = Workflow(spec=self.specUrl, owner="Steve",
                            name="wf001", task=self.taskName)
    testWorkflow.create()

    testFileset = Fileset(name="TestFileset")
    testFileset.create()

    for i in range(4):
        newFile = File(lfn="File%s" % i, locations=set(["T2_CH_CERN"]))
        newFile.create()
        testFileset.addFile(newFile)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=testWorkflow,
                                    split_algo="FileBased")
    testSubscription.create()

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.WMBS",
                          subscription=testSubscription)
    jobGroup = jobFactory(files_per_job=1)[0]

    assert len(jobGroup.jobs) == 4, \
        "Error: Splitting should have created four jobs."

    for testJob in jobGroup.jobs:
        testJob["user"] = "******"
        testJob["group"] = "DMWM"
        testJob["taskType"] = "Processing"
    testJobA, testJobB, testJobC, testJobD = jobGroup.jobs

    change.persist([testJobA, testJobB], "created", "new")
    change.persist([testJobC, testJobD], "new", "none")

    stateDAO = self.daoFactory(classname="Jobs.GetState")

    jobAState = stateDAO.execute(id=testJobA["id"])
    jobBState = stateDAO.execute(id=testJobB["id"])
    jobCState = stateDAO.execute(id=testJobC["id"])
    jobDState = stateDAO.execute(id=testJobD["id"])

    assert jobAState == "created" and jobBState == "created" and \
        jobCState == "new" and jobDState == "new", \
        "Error: Jobs didn't change state correctly."

    return
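# A minimal sketch of the two ChangeState entry points, assuming
# `change` and `jobs` as in the tests above: persist() only updates job
# state and retry counts in WMBS, while propagate() also records the
# transition in couch before persisting. killWorkflow() below relies on
# that distinction to skip the couch update for large batches.
#
#     change.propagate(jobs, "created", "new")  # couch update + WMBS
#     change.persist(jobs, "created", "new")    # WMBS only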
def killWorkflow(workflowName, jobCouchConfig, bossAirConfig=None):
    """
    _killWorkflow_

    Kill a workflow that is already executing inside the agent.  This
    will mark all incomplete jobs as killed and all files that belong to
    non-cleanup and non-logcollect subscriptions as failed.  The name of
    the JSM couch database and the URL to the database must be passed in
    as well so the state transitions are logged.
    """
    myThread = threading.currentThread()
    daoFactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)
    killFilesAction = daoFactory(classname="Subscriptions.KillWorkflow")
    killJobsAction = daoFactory(classname="Jobs.KillWorkflow")

    killFilesAction.execute(workflowName=workflowName,
                            conn=myThread.transaction.conn)

    liveJobs = killJobsAction.execute(workflowName=workflowName,
                                      conn=myThread.transaction.conn)

    changeState = ChangeState(jobCouchConfig)

    # Deal with any jobs that are running in the batch system.
    # This only works if we can start the BossAir API.
    if bossAirConfig:
        bossAir = BossAirAPI(config=bossAirConfig, noSetup=True)
        killableJobs = []
        for liveJob in liveJobs:
            if liveJob["state"].lower() == 'executing':
                # Then we need to kill this on the batch system.
                killableJobs.append(liveJob)
        # Now kill them.
        try:
            logging.info("Killing %d jobs for workflow: %s",
                         len(killableJobs), workflowName)
            bossAir.kill(jobs=killableJobs, workflowName=workflowName)
        except BossAirException as ex:
            # Something's gone wrong.  Jobs not killed!
            logging.error("Error while trying to kill running jobs in workflow!\n")
            logging.error(str(ex))
            trace = getattr(ex, 'traceback', '')
            logging.error(trace)
            # But continue; we need to kill the jobs in the master.
            # The batch system will have to take care of itself.

    liveWMBSJobs = defaultdict(list)
    for liveJob in liveJobs:
        if liveJob["state"] == "killed":
            # Then we've killed it already.
            continue
        liveWMBSJob = Job(id=liveJob["id"])
        liveWMBSJob.update(liveJob)
        liveWMBSJobs[liveJob["state"]].append(liveWMBSJob)

    for state, jobsByState in liveWMBSJobs.items():
        if len(jobsByState) > 100 and state != "executing":
            # If there are too many jobs, skip the couch and dashboard
            # updates.
            # TODO: couch and dashboard need to be updated separately or
            # in parallel.
            changeState.check("killed", state)
            changeState.persist(jobsByState, "killed", state)
        else:
            changeState.propagate(jobsByState, "killed", state)
    return
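def killWorkflowExample(agentConfig, workflowName="MyWorkflow"):
    """
    A minimal usage sketch for killWorkflow(), assuming the calling
    thread has already been set up with a logger, a database interface,
    and a transaction object (the usual WMAgent component
    initialization).  The workflow name and the single agentConfig
    object standing in for both the couch and BossAir configuration are
    illustrative placeholders, not part of the original module.
    """
    myThread = threading.currentThread()
    myThread.transaction.begin()
    # bossAirConfig is optional; without it only the WMBS and couch
    # state transitions happen and nothing is killed on the batch
    # system.
    killWorkflow(workflowName, jobCouchConfig=agentConfig,
                 bossAirConfig=agentConfig)
    myThread.transaction.commit()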