def handleEvent(self, payload):
    """
    The payload for a cleanup handler is a job id.
    """
    if self.failureArchive == None:
        logging.error("No Failure Archive set: Cannot Archive Job:\n %s" % payload)
        return
    try:
        logging.debug(">FailureCleanupHandler< archiving "+\
                      "information for jobspec: "+str(payload))
        try:
            os.makedirs(self.failureArchive)
        except:
            pass
        cacheDirLocation = JobState.general(str(payload))['CacheDirLocation']
        logging.debug(">FailureCleanupHandler< archiving and removing directory: "+\
                      cacheDirLocation)
        # NOTE: check what this does when it is repeated (e.g. after a crash)
        tar = tarfile.open(self.failureArchive+'/'+str(payload)+'.tar.gz', 'w:gz')
        short_root = cacheDirLocation.split('/')[-1]
        tar.add(cacheDirLocation, short_root)
        tar.close()
        try:
            for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(cacheDirLocation)
        except Exception, ex:
            logging.debug(">FailureCleanupHandler< WARNING job cleanup: "+str(ex))
        JobState.cleanout(str(payload))
        Job.remove(str(payload))
        logging.debug(">FailureCleanupHandler< archive completed for jobspecID: "+\
                      str(payload))

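# The NOTE above asks what happens when the archiving step is repeated (e.g.
# after a crash): tarfile.open(path, 'w:gz') simply overwrites any existing
# archive with the same name. A minimal, hypothetical helper sketching how the
# step could be made idempotent by skipping re-archiving when the tarball
# already exists; the function name and arguments are illustrative and not
# part of the original handler:
import os
import logging
import tarfile

def archiveOnce(failureArchive, jobSpecId, cacheDirLocation):
    """Create <failureArchive>/<jobSpecId>.tar.gz unless it already exists."""
    archivePath = os.path.join(failureArchive, str(jobSpecId) + '.tar.gz')
    if os.path.exists(archivePath):
        logging.debug("archive already exists, skipping: " + archivePath)
        return archivePath
    tar = tarfile.open(archivePath, 'w:gz')
    tar.add(cacheDirLocation, cacheDirLocation.split('/')[-1])
    tar.close()
    return archivePath
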
def eraseJob(self, jobSpecId):
    """
    Arguments:
      jobSpecId -- the job id.
    Return:
      none
    """
    logging.info("BossLiteKiller.eraseJob(%s)" % jobSpecId)

    # kill job
    self.killJob(jobSpecId, erase=True)

    # set the number of executions to be equal to the maximum number of
    # allowed retries so jobs will not be resubmitted, or not even
    # submitted at all if they have not been submitted yet
    try:
        JobState.doNotAllowMoreSubmissions([jobSpecId])
    except ProdAgentException, ex:
        msg = "Updating max racers fields failed for job %s\n" % jobSpecId
        msg += str(ex)
        logging.error(msg)
        raise

def testG(self):
    try:
        reportList = JobState.jobReports("jobClassID4")
        self.assertEqual(reportList,
            ['job/Report/Location4.0.xml',
             'job/Report/Location4.1.xml',
             'job/Report/Location4.2.xml'])
    except StandardError, ex:
        msg = "Failed State Change TestG:\n"
        msg += str(ex)
        self.fail(msg)

def testB(self):
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    try:
        JobState.purgeStates()
    except StandardError, ex:
        msg = "Failed State Change TestB:\n"
        msg += str(ex)
        self.fail(msg)

def testF(self):
    try:
        self.assertEqual(JobState.lastLocations("jobClassID4"),
            ["some.location4.0", "some.location4.1", "some.location4.2"])
        self.assertEqual(JobState.lastLocations("jobClassID2"),
            ["some.location2.1"])
    except StandardError, ex:
        msg = "Failed State Change TestF:\n"
        msg += str(ex)
        self.fail(msg)

def testA(self):
    """change state test"""
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    try:
        for i in [1, 2]:
            JobState.cleanout("jobClassID"+str(i))
    except StandardError, ex:
        msg = "Failed State Change TestA:\n"
        msg += str(ex)
        self.fail(msg)

def __avoidResubmission(self, jobSpecId):
    """
    Set the number of executions to be equal to the maximum number of
    allowed retries so jobs will not be resubmitted, or not even
    submitted at all if they have not been submitted yet.
    """
    try:
        JobState.doNotAllowMoreSubmissions([jobSpecId])
    except ProdAgentException, ex:
        msg = "Updating max racers fields failed for job %s\n" % jobSpecId
        msg += str(ex)
        logging.error(msg)
        raise

def testF(self):
    print("""\nSet the job cache (used for failure job cleanup)""")
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        for i in xrange(0, self.failureJobSpecs):
            JobState.register("failureJobSpec"+str(i), "Processing", 2, 2)
            JobState.create("failureJobSpec"+str(i),
                            self.location+"/failureJobSpecDir_"+str(i))
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testF:\n"
        msg += str(ex)
        self.fail(msg)

def eraseWorkflow(self, workflowSpecId):
    """
    Arguments:
      workflowSpecId -- the workflow id.
    Return:
      none
    """
    logging.info("TaskQueueKiller.eraseWorkflow(%s)" % workflowSpecId)

    # get job ids for workflow workflowSpecId
    jobs = JobState.retrieveJobIDs([workflowSpecId])
    jobs = map(lambda x: x[0], jobs)
    logging.debug("TaskQueueKiller-> jobs: %s" % str(jobs))
    totalJobs = len(jobs)
    if totalJobs == 0:
        logging.info("No jobs associated to the workflow %s" % \
                     workflowSpecId)
        return

    # kill all jobs (those not in the queue will be ignored)
    self.tqApi.killTasks(jobs)

    # avoid resubmission
    for job in jobs:
        self.__avoidResubmission(job)

    return

def testH(self):
    # NOTE: this test is disabled by the early return below
    return
    print("""\nCleanup the prodagent database""")
    print("\nsleep for 20 seconds to")
    print("let the cleanup component receive the messages")
    time.sleep(20)
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        JobState.purgeStates()
        self.ms.purgeMessages()
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testH:\n"
        msg += str(ex)
        self.fail(msg)

def testD(self):
    """change state test"""
    try:
        JobState.register("jobClassID4", "Processing", 6, 2, "myWorkflowID")
        JobState.create("jobClassID4", "cacheDir/location/4somewhere")
        JobState.inProgress("jobClassID4")
        # retries=racers=0
        self.assertEqual(JobState.general("jobClassID4"),
            {'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/4somewhere',
             'MaxRacers': 2L, 'Racers': 0L, 'State': 'inProgress',
             'MaxRetries': 6L, 'JobType': 'Processing'})

def handleEvent(self, payload):
    """
    The payload of a partial cleanup handler is a job id and the
    event (plus payload) it needs to emit afterwards.
    """
    payloads = payload.split(',')
    jobId = payloads[0]
    nextEvent = payloads[1]
    nextPayload = payloads[2]
    delay = 0
    if len(payloads) == 4:
        delay = payloads[3]

    try:
        logging.debug(">PartialCleanupHandler< removing cached files "+\
                      "for jobspec: "+str(jobId))
        cacheDirLocation = JobState.general(str(jobId))['CacheDirLocation']
        logging.debug(">PartialCleanupHandler< starting remove in: "+cacheDirLocation)
        try:
            for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                for name in files:
                    # check if the file is an .xml or .tar.gz file;
                    # if so, do not remove it.
                    # NOTE: should use reg. exp. here.
                    isSaved = False
                    # we only keep files that are in the top dir.
                    # if we are in the top dir we check for certain extensions.
                    if root == cacheDirLocation:
                        extensions = ['.xml', '.tar.gz']
                        for extension in extensions:
                            pos1 = name.rfind(extension)
                            pos2 = len(name)-len(extension)
                            if(pos1 == pos2):
                                isSaved = True
                                break
                    if not isSaved:
                        try:
                            os.remove(os.path.join(root, name))
                        except Exception, ex:
                            logging.debug(">PartialCleanupHandler< WARNING "+\
                                          " partial job cleanup: "+str(ex))
                    else:
                        logging.debug(">PartialCleanupHandler< not removing: "+name)
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
        except Exception, ex:
            logging.debug(">PartialCleanupHandler< WARNING partial job cleanup: "+\
                          str(ex))
    except Exception, ex:
        logging.debug(">PartialCleanupHandler< ERROR partial job cleanup: "+\
                      str(ex))

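# The NOTE above suggests replacing the rfind() bookkeeping with a regular
# expression. A minimal sketch of that alternative, assuming the same list of
# protected extensions; the isProtectedFile helper is hypothetical and not
# part of the original handler:
import re

PROTECTED_PATTERN = re.compile(r'.*(\.xml|\.tar\.gz)$')

def isProtectedFile(name):
    """Return True if the file name ends in .xml or .tar.gz."""
    return PROTECTED_PATTERN.match(name) is not None

# equivalently, without a regular expression:
#   isSaved = name.endswith(('.xml', '.tar.gz'))
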
def getJobCache(self, jobSpecId):
    """
    _getJobCache_

    Look up the job cache for the job spec id provided
    """
    try:
        stateInfo = JobState.general(jobSpecId)
    except Exception, ex:
        msg = "ERROR: Can't get JobCache for %s\n" % jobSpecId
        msg += str(ex)
        logging.warning(msg)
        stateInfo = {}

def handleError(self, payload):
    jobId = payload
    generalInfo = JobState.general(jobId)
    delay = int(self.args['DelayFactor'])*(int(generalInfo['Retries']+1))
    delay = convertSeconds(delay)
    logging.debug(">CreateFailureHandler<: re-creating with delay "+\
                  " (h:m:s) "+str(delay))
    try:
        JobState.createFailure(jobId)
        logging.debug(">CreateFailureHandler<: Registered "+\
                      "a create failure,"\
                      "publishing a create event")
        self.publishEvent("CreateJob", (jobId), delay)
    except ProdException, ex:
        if(ex["ErrorNr"] == 3013):
            logging.debug(">CreateFailureHandler<: Registered "+\
                          "a create failure "+ \
                          "Maximum number of retries reached!"+\
                          " Submitting a general failure and cleanup job event ")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", (jobId))
            self.publishEvent("GeneralJobFailure", (jobId))

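# Worked example of the delay computation above, assuming a hypothetical
# DelayFactor of 60 (seconds) configured for this handler: the third create
# failure of a job (Retries == 2) is re-published with
#   delay = 60 * (2 + 1) = 180 seconds,
# which convertSeconds() turns into the (h:m:s) form used both in the log
# message and in the delayed CreateJob publish.
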
def killWorkflow(self, workflowSpecId):
    """
    Arguments:
      workflowSpecId -- the workflow id.
    Return:
      none
    """
    logging.info("BossLiteKiller.killWorkflow(%s)" % workflowSpecId)

    # get job ids for workflow workflowSpecId
    jobs = JobState.retrieveJobIDs([workflowSpecId])
    totalJobs = len(jobs)
    if totalJobs == 0:
        logging.info("No jobs associated to the workflow %s" % \
                     workflowSpecId)
        return

    skippedJobs = 0

    # kill all jobs
    for job in jobs:
        jobName = job[0]

        # kill each one independently
        try:
            self.killJob(jobName)

        # if the job is not found (it may have just finished), killJob()
        # has printed the error message. Try the next one
        except InvalidJobException, msg:
            logging.debug("job %s not yet created, no need to kill it" % \
                          jobName)
            skippedJobs += 1

        # not yet submitted, no need to kill it
        except JobNotSubmittedException, msg:
            logging.debug(
                "job %s not yet submitted or finished, no need to kill it" \
                % jobName)

def testA(self):
    """change state test"""
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    try:
        # illegal state transition:
        try:
            JobState.create("jobClassID1", "cacheDir/location/1somewhere")
        except ProdException, ex:
            print('>>>Test succeeded for exception 1/3 in testA of JobState_t.py\n')

        self.assertEqual(JobState.isRegistered("jobClassID1"), False)
        JobState.register("jobClassID1", "Processing", 3, 1, "myWorkflowID")
        self.assertEqual(JobState.isRegistered("jobClassID1"), True)

        # register again (illegal):
        try:
            JobState.register("jobClassID1", "Processing", 3, 1, "myWorkflowID")
            print('>>>Test ERROR \n')
        except ProdException, ex:
            print('>>>Test succeeded for exception 2/3 in testA of JobState_t.py\n')

def killJob(self, jobSpecId, erase=False):
    """
    Arguments:
      jobSpecId -- the job id.
      erase -- remove job info from BOSS database
    Return:
      none
    """
    # jobSpecId is job['name'] for BossLite
    # Fabio
    logging.info("BossLiteKiller.killJob(%s)" % jobSpecId)

    # verify that the job exists
    try:
        stateInfo = JobState.general(jobSpecId)
    except StandardError, ex:
        msg = "Cannot retrieve JobState Information for %s\n" % jobSpecId
        msg += str(ex)
        logging.error(msg)
        raise InvalidJobException, msg

# kill job
self.killJob(jobSpecId, erase=True)

# set the number of executions to be equal to the maximum number of
# allowed retries so jobs will not be resubmitted, or not even
# submitted at all if they have not been submitted yet
try:
    JobState.doNotAllowMoreSubmissions([jobSpecId])
except ProdAgentException, ex:
    msg = "Updating max racers fields failed for job %s\n" % jobSpecId
    msg += str(ex)
    logging.error(msg)
    raise

# remove all entries
JobState.cleanout(jobSpecId)

def eraseWorkflow(self, workflowSpecId):
    """
    Arguments:
      workflowSpecId -- the workflow id.
    Return:
      none
    """
    logging.info("BossLiteKiller.eraseWorkflow(%s)" % workflowSpecId)

""" Set number of executions to be equal to the maximum number of allowed retries so jobs will not be resubmitted, or even not submitted at all if they have not been submitted yet. """ try: JobState.doNotAllowMoreSubmissions([jobSpecId]) except ProdAgentException, ex: msg = "Updating max racers fields failed for job %s\n" % jobSpecId msg += str(ex) logging.error(msg) raise # remove all entries JobState.cleanout(jobSpecId) def eraseJob(self, jobSpecId): """ Arguments: JobSpecId -- the job id. Return: none """ logging.info("TaskQueueKiller.eraseJob(%s)" % jobSpecId) # kill job self.killJob(jobSpecId)
def testE(self):
    try:
        JobState.register("jobClassID5", "Processing", 2, 2, "myWorkflowID")
        JobState.create("jobClassID5", "cacheDir/location/5somewhere")
        JobState.inProgress("jobClassID5")
        JobState.submit("jobClassID5")
        # now introduce some failures until we have more failures
        # than retries (this raises an error)
        JobState.runFailure("jobClassID5", "jobInstanceID5.1",
                            "some.location5.1", "job/Report/Location5.1.xml")
        try:
            JobState.runFailure("jobClassID5", "jobInstanceID5.2",
                                "some.location5.1", "job/Report/Location5.1.xml")
        except ProdException, ex:
            print('>>>Test succeeded for exception 1/1 in testE of JobState_t.py\n')
        JobState.finished("jobClassID5")

def testH(self):
    JobState.register("jobClassID7", "Processing", 8, 2, "myWorkflowID")
    JobState.register("jobClassID8", "Processing", 8, 2, "myWorkflowID")
    JobState.register("jobClassID9", "Processing", 8, 2, "myWorkflowID")

def testC(self):
    """change state test"""
    try:
        JobState.register("jobClassID3", "Merge", 5, 1, "myWorkflowID")
        JobState.create("jobClassID3", "cacheDir/location/3somewhere")
        JobState.inProgress("jobClassID3")
        JobState.submit("jobClassID3")
        # try an illegal state transition:
        try:
            JobState.create("jobClassID3", "cacheDir/location3somewhere")
        except ProdException, ex:
            print('>>>Test succeeded for exception 1/3 in testC of JobState_t.py\n')
        # try to submit another job while the first one has not finished
        # (we are only allowed one racer)
        try:
            JobState.submit("jobClassID3")
        except ProdException, ex:
            print('>>>Test succeeded for exception 2/3 in testC of JobState_t.py\n')

def testK(self):
    jobIDs = []
    for i in xrange(0, 20):
        JobState.register("jobClassID_0."+str(i), "Processing", 30, 1)
        JobState.register("jobClassID_1."+str(i), "Processing", 30, 1, "myWorkflowID1")
        JobState.register("jobClassID_2."+str(i), "Processing", 30, 1, "myWorkflowID2")
        JobState.register("jobClassID_3."+str(i), "Processing", 30, 1, "myWorkflowID3")
        jobIDs.append("jobClassID_1."+str(i))
        jobIDs.append("jobClassID_2."+str(i))
        jobIDs.append("jobClassID_3."+str(i))
    JobState.setMaxRetries(jobIDs, 2)
    self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'], 2)
    JobState.setMaxRetries("jobClassID_1.1", 3)
    self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'], 3)
    jobIDs = JobState.retrieveJobIDs("myWorkflowID1")
    self.assertEqual(len(jobIDs), 20)
    jobIDs = JobState.retrieveJobIDs(["myWorkflowID1", "myWorkflowID2", "myWorkflowID3"])
    self.assertEqual(len(jobIDs), 60)
    jobs = JobState.rangeGeneral(0, 10)
    print(str(jobs))

def testI(self):
    JobState.register("jobClassID10", "Processing", 8, 2, "myWorkflowID")
    # retries=0, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 0, 'CacheDirLocation': None, 'MaxRacers': 2,
         'Racers': 0, 'State': 'register', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.createFailure("jobClassID10")
    # retries=1, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 1, 'CacheDirLocation': None, 'MaxRacers': 2,
         'Racers': 0, 'State': 'register', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.createFailure("jobClassID10")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': None, 'MaxRacers': 2,
         'Racers': 0, 'State': 'register', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.create("jobClassID10", "cacheDir/location/10somewhere")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'create', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.inProgress("jobClassID10")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.submitFailure("jobClassID10")
    # retries=3, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.submit("jobClassID10")
    # retries=3, racers=1
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.submitFailure("jobClassID10")
    # retries=4, racers=1
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 8,
         'JobType': 'Processing'})
    JobState.submit("jobClassID10")
    # retries=4, racers=2
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 2, 'State': 'inProgress', 'MaxRetries': 8,
         'JobType': 'Processing'})
    # on purpose we introduce an error: a third racer exceeds MaxRacers
    try:
        JobState.submit("jobClassID10")
    except ProdException, ex:
        print('>>>Test succeeded for exception 1/1 in testI of JobState_t.py\n')

def handleError(self, payload):
    """
    The payload of a job failure is a URL to the job report.
    """
    jobReportUrl = payload

    # prepare to retrieve the job report file.
    # NOTE: we assume that the report file has a relatively unique name
    # NOTE: if that is not the case we need to add a unique identifier to it.
    slash = jobReportUrl.rfind('/')
    fileName = jobReportUrl[slash+1:]
    urllib.urlretrieve(jobReportUrl, \
                       self.args['jobReportLocation']+'/'+fileName)
    logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)

    jobReport = readJobReport(self.args['jobReportLocation']+'/'+fileName)
    # NOTE: is this the right way to extract the job id?
    jobId = jobReport[0].jobSpecId
    logging.debug(">RunFailureHandler<:Retrieving jobId from job report "+\
                  "(used to dynamically load error handler) "\
                  "jobId="+str(jobId))

    # create the jobReportLocation jobId hierarchy if it does not exist.
    pipe = os.popen("mkdir -p "+self.args['jobReportLocation']+'/'+jobId)
    pipe.close()

    # move the report file to this new location.
    pipe = os.popen("mv "+self.args['jobReportLocation']+'/'+fileName+" "+ \
                    self.args['jobReportLocation']+'/'+jobId)
    logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " \
                  +self.args['jobReportLocation']+'/'+jobId)
    pipe.close()

    reportLocation = self.args['jobReportLocation']+'/'+ \
                     jobId+'/'+fileName

    generalInfo = JobState.general(jobId)

    # a submit event with delay
    delay = int(self.args['DelayFactor'])*(int(generalInfo['Retries']+1))
    delay = convertSeconds(delay)
    logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) "+\
                  str(delay))

    if self.args['ReportAction'] == 'move':
        # count how many files are in the dir (to generate unique ids
        # when moving files)
        try:
            lastID = len(os.listdir(os.path.dirname(payload)))
            target = os.path.join(os.path.dirname(payload),\
                                  os.path.basename(payload).split('.')[0] +\
                                  str(lastID) +\
                                  '.xml')
            logging.debug('Moving file: '+payload+' to: '+target)
            shutil.move(payload, target)
        except:
            pass

    try:
        JobState.runFailure(jobId, jobReportLocation=reportLocation)

        # check the cache dir size. If it is beyond the threshold, purge it.
        dirSizeBytes = dirSize(generalInfo['CacheDirLocation'], 0, 0, 0)
        dirSizeMegaBytes = convertSize(dirSizeBytes, 'm')
        logging.debug(">RunFailureHandler<:Cache dir. size is "+\
                      str(dirSizeMegaBytes)+" MB. Maximum allowed is "+\
                      str(self.maxCacheDirSizeMB)+" MB ")
        jobspecfile = "%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'], jobId)

        # if necessary a partial cleanup is done first, which publishes the
        # proper event once it has finished.
        # retrieve the number of retries and publish
        if(float(dirSizeMegaBytes) > float(self.maxCacheDirSizeMB)):
            newPayload = jobId+",SubmitJob,"+jobId+","+str(delay)
            logging.debug(">RunFailureHandler<: Reached maximum cache size. "+\
                          "Performing partial cache cleanup first.")
            self.publishEvent("PartialJobCleanup", newPayload, delay)
        else:
            logging.debug(">RunFailureHandler<:Registered "+\
                          "a job run failure,"\
                          "publishing a submit job event")
            if self.args['QueueFailures']:
                JobQueueAPI.reQueueJob(jobId)
            else:
                self.publishEvent("SubmitJob", jobspecfile, delay)

    except ProdException, ex:
        if(ex["ErrorNr"] == 3013):
            logging.debug(">RunFailureHandler<:Registered "+\
                          "a job run failure "+ \
                          "Maximum number of retries reached!"+\
                          " Submitting a failure job and cleanup event ")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", (jobId))
            self.publishEvent("GeneralJobFailure", (jobId))

def testB(self):
    """change state test"""
    try:
        JobState.register("jobClassID2", "Processing", 2, 1, "myWorkflowID")
        JobState.create("jobClassID2", "cacheDir/location/2somewhere")
        JobState.inProgress("jobClassID2")
        # retries=racers=0
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
        JobState.submit("jobClassID2")
        # retries=0, racers=1
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
        JobState.runFailure("jobClassID2", "jobInstanceID2.1",
                            "some.location2.1", "job/Report/Location2.1.xml")
        # retries=1, racers=0
        self.assertEqual(JobState.general("jobClassID2"),
            {'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 2, 'Retries': 1, 'JobType': 'Processing'})
        JobState.submit("jobClassID2")
        # retries=1, racers=1
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 1L, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress',
             'MaxRetries': 2L, 'JobType': 'Processing'})
    except StandardError, ex:
        msg = "Failed State Change TestB:\n"
        msg += str(ex)
        self.fail(msg)

self.assertEqual(JobState.isRegistered("jobClassID1"), False)
JobState.register("jobClassID1", "Processing", 3, 1, "myWorkflowID")
self.assertEqual(JobState.isRegistered("jobClassID1"), True)

# register again (illegal):
try:
    JobState.register("jobClassID1", "Processing", 3, 1, "myWorkflowID")
    print('>>>Test ERROR \n')
except ProdException, ex:
    print('>>>Test succeeded for exception 2/3 in testA of JobState_t.py\n')

try:
    # illegal state transition:
    JobState.inProgress("jobClassID1")
except ProdException, ex:
    print('>>>Test succeeded for exception 3/3 in testA of JobState_t.py\n')

JobState.create("jobClassID1", "cacheDir/location/1somewhere")
JobState.inProgress("jobClassID1")
# retries=0, racers=0
self.assertEqual(JobState.general("jobClassID1"),
    {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/1somewhere',
     'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
     'MaxRetries': 3, 'JobType': 'Processing'})
JobState.submit("jobClassID1")
# retries=0, racers=1
self.assertEqual(JobState.general("jobClassID1"),
    {'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/1somewhere',
     'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress',
     'MaxRetries': 3L, 'JobType': 'Processing'})
JobState.runFailure("jobClassID1", "jobInstanceID1.1",
                    "some.location1.1", "job/Report/Location1.1.xml")
JobState.submit("jobClassID1")
except StandardError, ex: