def initialize(self):
    """Agent initialisation: read and set configuration options.

    :param self: self reference
    """
    # Shifter proxy.
    # See the cleanContent method: this proxy will be used ALSO when the file
    # catalog used is the DIRAC File Catalog (DFC). This is possible because
    # the "UseServerCertificate" option is unset.
    self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

    # Transformation types
    self.dataProcTTypes = Operations().getValue("Transformations/DataProcessing", self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue("Transformations/DataManipulation", self.dataManipTTypes)
    agentTSTypes = self.am_getOption("TransformationTypes", [])
    if agentTSTypes:
        self.transformationTypes = sorted(agentTSTypes)
    else:
        self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

    # Directory locations
    self.directoryLocations = sorted(self.am_getOption("DirectoryLocations", self.directoryLocations))
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

    # Transformation metadata
    self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # Archive period in days
    self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter)  # days
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # Transformation log SEs
    self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # Clients
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()
    self.jobMonitoringClient = JobMonitoringClient()

    return S_OK()
def initialize(self):
    """Initialize the agent."""
    self.am_setOption("PollingTime", 60)
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()
    return S_OK()
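# All of the snippets in this collection follow the DIRAC return-value
# convention: calls return S_OK(value) -> {'OK': True, 'Value': value} or
# S_ERROR(message) -> {'OK': False, 'Message': message}, and callers must
# check 'OK' before reading 'Value'. A minimal self-contained sketch of that
# pattern (the fetchStatus helper is illustrative, not part of the snippets):
from DIRAC import S_OK, S_ERROR


def fetchStatus(jobID, jobMonitoringClient):
    result = jobMonitoringClient.getJobsStatus([jobID])
    if not result['OK']:
        # propagate the error structure unchanged, as the snippets here do
        return result
    status = result['Value'].get(jobID, {}).get('Status')
    if status is None:
        return S_ERROR("Job %s does not exist" % jobID)
    return S_OK(status)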
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
             ownerDN=None, ownerGroup=None):
    """Generates some default objects.

    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it
    also works: VOs can pass in their job class extension, if present.
    """
    if not logger:
        logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    useCertificates = bool(ownerDN) and bool(ownerGroup)
    if not submissionClient:
        self.submissionClient = WMSClient(useCertificates=useCertificates,
                                          delegatedDN=ownerDN,
                                          delegatedGroup=ownerGroup)
    else:
        self.submissionClient = submissionClient

    if not jobMonitoringClient:
        self.jobMonitoringClient = JobMonitoringClient()
    else:
        self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
        self.jobClass = Job
    else:
        self.jobClass = jobClass

    if not opsH:
        self.opsH = Operations()
    else:
        self.opsH = opsH

    if not outputDataModule:
        self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
        self.outputDataModule = outputDataModule

    if not destinationPlugin:
        self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    else:
        self.destinationPlugin = destinationPlugin

    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def deleteJobOversizedSandbox(self, jobIDList):
    """Delete the job oversized sandbox files from storage elements."""
    failed = {}
    successful = {}

    result = JobMonitoringClient().getJobParameters(jobIDList, 'OutputSandboxLFN')
    if not result['OK']:
        return result
    osLFNList = result['Value']
    if not osLFNList:
        return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now
    for jobID, outputSandboxLFNdict in osLFNList.items():  # was iteritems(), which is Python 2 only
        lfn = outputSandboxLFNdict['OutputSandboxLFN']
        result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not result['OK']:
            failed[jobID] = lfn
            continue
        if not result['Value']:
            failed[jobID] = lfn
            continue
        ownerDN = result['Value']['OwnerDN']
        ownerGroup = result['Value']['OwnerGroup']
        result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if not result['OK']:
            failed[jobID] = lfn
        else:
            successful[jobID] = lfn

    return S_OK({'Successful': successful, 'Failed': failed})
def checkJobStateTransition(jobID, candidateState, currentStatus=None, jobMonitoringClient=None):
    """Utility to check if a job state transition is allowed."""
    if not currentStatus:
        if not jobMonitoringClient:
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

            jobMonitoringClient = JobMonitoringClient()

        res = jobMonitoringClient.getJobsStatus(jobID)
        if not res["OK"]:
            return res
        try:
            currentStatus = res["Value"][jobID]["Status"]
        except KeyError:
            return S_ERROR("Job does not exist")

    res = JobsStateMachine(currentStatus).getNextState(candidateState)
    if not res["OK"]:
        return res

    # If the JobsStateMachine does not accept the candidate, return an ERROR
    if candidateState != res["Value"]:
        gLogger.error(
            "Job Status Error",
            "%s can't move from %s to %s" % (jobID, currentStatus, candidateState),
        )
        return S_ERROR("Job state transition not allowed")

    return S_OK()
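# A minimal usage sketch for checkJobStateTransition above. Passing
# currentStatus explicitly skips the JobMonitoring round trip, which is
# convenient in tests; the job ID and states used here are illustrative only.
res = checkJobStateTransition(1234, "Running", currentStatus="Matched")
if res["OK"]:
    print("Matched -> Running is an allowed transition")
else:
    print("Transition rejected: %s" % res["Message"])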
def test_ParametricChain(self):
    """This test will submit a parametric job which should generate 3 actual jobs"""
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result['OK'])
    jobIDList = result['Value']
    self.assertEqual(len(jobIDList), 3)

    result = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(result['OK'])
    jobNames = [result['Value'][jobID]['JobName'] for jobID in result['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    for jobID in jobIDList:
        result = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(result['OK'])

    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result['OK'])

    for jobID in jobIDList:
        result = jobMonitor.getJobStatus(jobID)
        self.assertTrue(result['OK'])
        self.assertEqual(result['Value'], 'Deleted')
def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = "ComponentSupervisionAgent"
    self.setup = "DIRAC-Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
    self.diracLocation = rootPath

    self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    self._tornadoPort = "8443"
    self.errors = list()
    self.accounting = defaultdict(dict)

    self.addressTo = []
    self.addressFrom = ""
    self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Generates some default objects.

    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it
    also works: VOs can pass in their job class extension, if present.
    """
    if not logger:
        logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    if not submissionClient:
        from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
        self.submissionClient = WMSClient()
    else:
        self.submissionClient = submissionClient

    if not jobMonitoringClient:
        from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
        self.jobMonitoringClient = JobMonitoringClient()
    else:
        self.jobMonitoringClient = jobMonitoringClient

    if not outputDataModule:
        self.outputDataModule = gConfig.getValue("/DIRAC/VOPolicy/OutputDataModule", "")
    else:
        self.outputDataModule = outputDataModule

    if not jobClass:
        from DIRAC.Interfaces.API.Job import Job
        self.jobClass = Job
    else:
        self.jobClass = jobClass

    if not opsH:
        from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
        self.opsH = Operations()
    else:
        self.opsH = opsH
def deleteJobOversizedSandbox(self, jobIDList):
    """Delete the job oversized sandbox files from storage elements."""
    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
        result = JobMonitoringClient().getJobParameter(jobID, 'OutputSandboxLFN')
        if result['OK']:
            lfn = result['Value'].get('OutputSandboxLFN')
            if lfn:
                lfnDict[lfn] = jobID
            else:
                successful[jobID] = 'No oversized sandbox found'
        else:
            gLogger.error('Error interrogating JobDB: %s' % result['Message'])
    if not lfnDict:
        return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
        result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not result['OK']:
            failed[jobID] = lfn
            continue
        if not result['Value']:
            failed[jobID] = lfn
            continue
        ownerDN = result['Value']['OwnerDN']
        ownerGroup = result['Value']['OwnerGroup']
        result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if not result['OK']:
            failed[jobID] = lfn
        else:
            successful[jobID] = lfn

    return S_OK({'Successful': successful, 'Failed': failed})
def deleteJobOversizedSandbox(self, jobIDList):
    """Deletes the job oversized sandbox files from storage elements.

    Creates a request in RMS if not immediately possible.

    :param list jobIDList: list of job IDs

    :returns: S_OK/S_ERROR
    """
    failed = {}
    successful = {}

    result = JobMonitoringClient().getJobParameters(jobIDList, ["OutputSandboxLFN"])
    if not result["OK"]:
        return result
    osLFNDict = result["Value"]
    if not osLFNDict:
        return S_OK({"Successful": successful, "Failed": failed})
    osLFNDict = dict(osLFN for osLFN in osLFNDict.items() if osLFN[1])
    self.log.verbose("Deleting oversized sandboxes", osLFNDict)

    # Schedule removal of the LFNs now
    for jobID, outputSandboxLFNdict in osLFNDict.items():  # can be an iterator
        lfn = outputSandboxLFNdict["OutputSandboxLFN"]
        result = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not result["OK"]:
            failed[jobID] = lfn
            continue
        if not result["Value"]:
            failed[jobID] = lfn
            continue
        ownerDN = result["Value"]["OwnerDN"]
        ownerGroup = result["Value"]["OwnerGroup"]
        result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if not result["OK"]:
            failed[jobID] = lfn
        else:
            successful[jobID] = lfn

    return S_OK({"Successful": successful, "Failed": failed})
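# Illustrative only: how a caller might consume the {'Successful': ..., 'Failed': ...}
# dictionary returned by deleteJobOversizedSandbox (jobCleaner stands for a
# hypothetical instance of the class defining the method above; the IDs are fake).
res = jobCleaner.deleteJobOversizedSandbox([101, 102, 103])
if res['OK']:
    for jobID, lfn in res['Value']['Failed'].items():
        print('Could not schedule removal of %s for job %s' % (lfn, jobID))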
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Generates some default objects.

    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it
    also works: VOs can pass in their job class extension, if present.
    """
    if not logger:
        logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    if not submissionClient:
        self.submissionClient = WMSClient()
    else:
        self.submissionClient = submissionClient

    if not jobMonitoringClient:
        self.jobMonitoringClient = JobMonitoringClient()
    else:
        self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
        self.jobClass = Job
    else:
        self.jobClass = jobClass

    if not opsH:
        self.opsH = Operations()
    else:
        self.opsH = opsH

    if not outputDataModule:
        self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
        self.outputDataModule = outputDataModule
def __getJobPilotStatus(self, jobID):
    """Get the job pilot status."""
    result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not result['OK']:
        return result
    pilotReference = result['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    result = PilotManagerClient().getPilotInfo(pilotReference)
    if not result['OK']:
        if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
            self.log.warn("No pilot found", "for job %d: %s" % (jobID, result['Message']))
            return S_OK('NoPilot')
        self.log.error('Failed to get pilot information', 'for job %d: %s' % (jobID, result['Message']))
        return result
    pilotStatus = result['Value'][pilotReference]['Status']
    return S_OK(pilotStatus)
def test_ParametricChain(self):
    """This test will submit a parametric job which should generate 3 actual jobs"""
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    jobIDList = res["Value"]
    self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

    res = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
    self.assertTrue(res["OK"], res.get("Message"))
    jobNames = [res["Value"][jobID]["JobName"] for jobID in res["Value"]]
    self.assertEqual(set(jobNames), set(["parametric_helloWorld_%s" % nJob for nJob in range(3)]))

    for jobID in jobIDList:
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(res["OK"], res.get("Message"))

    res = wmsClient.deleteJob(jobIDList)
    self.assertTrue(res["OK"], res.get("Message"))

    for jobID in jobIDList:
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"], JobStatus.DELETED, msg="Got %s" % str(res["Value"]))
def __getJobPilotStatus(self, jobID):
    """Get the job pilot status."""
    result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not result['OK']:
        return result
    pilotReference = result['Value'].get('Pilot_Reference')
    if not pilotReference:
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    result = PilotManagerClient().getPilotInfo(pilotReference)
    if not result['OK']:
        if "No pilots found" in result['Message']:
            self.log.warn(result['Message'])
            return S_OK('NoPilot')
        self.log.error('Failed to get pilot information', 'for job %d: ' % jobID + result['Message'])
        return S_ERROR('Failed to get the pilot status')
    pilotStatus = result['Value'][pilotReference]['Status']
    return S_OK(pilotStatus)
def _getJobPilotStatus(self, jobID):
    """Get the job pilot status."""
    result = JobMonitoringClient().getJobParameter(jobID, "Pilot_Reference")
    if not result["OK"]:
        return result
    pilotReference = result["Value"].get("Pilot_Reference", "Unknown")
    if pilotReference == "Unknown":
        # There is no pilot reference, hence its status is unknown
        return S_OK("NoPilot")

    result = PilotManagerClient().getPilotInfo(pilotReference)
    if not result["OK"]:
        if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
            self.log.warn("No pilot found", "for job %d: %s" % (jobID, result["Message"]))
            return S_OK("NoPilot")
        self.log.error("Failed to get pilot information", "for job %d: %s" % (jobID, result["Message"]))
        return result
    pilotStatus = result["Value"][pilotReference]["Status"]
    return S_OK(pilotStatus)
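# The newer pilot-status variants above distinguish "no pilot yet" from real
# failures by comparing DIRAC error codes rather than matching message strings.
# A small sketch of that comparison pattern (the error result is fabricated
# here purely for illustration):
from DIRAC import S_ERROR
from DIRAC.Core.Utilities import DErrno

result = S_ERROR(DErrno.EWMSNOPILOT, "No pilots found for this job")
if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
    print("benign condition: the job simply has no pilot yet")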
def __call__(self):
    """request processing"""
    self.log.debug("about to execute request")
    if not self.rmsMonitoring:
        gMonitor.addMark("RequestAtt", 1)

    # setup proxy for request owner
    setupProxy = self.setupProxy()
    if not setupProxy["OK"]:
        userSuspended = "User is currently suspended"
        self.request.Error = setupProxy["Message"]
        # In case the user does not have proxy
        if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
            self.log.error("Error setting proxy. Request set to Failed:", setupProxy["Message"])
            # If user is no longer registered, fail the request
            for operation in self.request:
                for opFile in operation:
                    opFile.Status = "Failed"
                operation.Status = "Failed"
        elif userSuspended in setupProxy["Message"]:
            # If user is suspended, wait for a long time
            self.request.delayNextExecution(6 * 60)
            self.request.Error = userSuspended
            self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN)
        else:
            self.log.error("Error setting proxy", setupProxy["Message"])
        return S_OK(self.request)
    shifter = setupProxy["Value"]["Shifter"]

    error = None

    while self.request.Status == "Waiting":

        # get waiting operation
        operation = self.request.getWaiting()
        if not operation["OK"]:
            self.log.error("Cannot get waiting operation", operation["Message"])
            return operation
        operation = operation["Value"]
        self.log.info("executing operation", "%s" % operation.Type)

        # and handler for it
        handler = self.getHandler(operation)
        if not handler["OK"]:
            self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"]))
            # gMonitor.addMark("%s%s" % (operation.Type, "Fail"), 1)
            operation.Error = handler["Message"]
            break

        handler = handler["Value"]
        # set shifters list in the handler
        handler.shifter = shifter
        # set rmsMonitoring flag for the RequestOperation
        handler.rmsMonitoring = self.rmsMonitoring
        # and execute
        pluginName = self.getPluginName(self.handlersDict.get(operation.Type))
        if self.standalone:
            useServerCertificate = gConfig.useServerCertificate()
        else:
            # Always use server certificates if executed within an agent
            useServerCertificate = True
        try:
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Attempted",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
            # Always use request owner proxy
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "false")
            exe = handler()
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true")
            if not exe["OK"]:
                self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"]))
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp": int(Time.toEpoch()),
                            "host": Network.getFQDN(),
                            "objectType": "Operation",
                            "operationType": pluginName,
                            "objectID": operation.OperationID,
                            "parentID": operation.RequestID,
                            "status": "Failed",
                            "nbObject": 1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Request",
                        "objectID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if self.request.JobID:
                    # Check if the job exists
                    monitorServer = JobMonitoringClient(useCertificates=True)
                    res = monitorServer.getJobSummary(int(self.request.JobID))
                    if not res["OK"]:
                        self.log.error("RequestTask: Failed to get job status", "%d" % self.request.JobID)
                    elif not res["Value"]:
                        self.log.warn(
                            "RequestTask: job does not exist (anymore): failed request",
                            "JobID: %d" % self.request.JobID,
                        )
                        for opFile in operation:
                            opFile.Status = "Failed"
                        if operation.Status != "Failed":
                            operation.Status = "Failed"
                        self.request.Error = "Job no longer exists"
        except Exception as e:
            error = str(e)
            self.log.exception("hit by exception:", "%s" % error)
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Request",
                    "objectID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("RequestFail", 1)
            if useServerCertificate:
                gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true")
            break

        # operation status check
        if operation.Status == "Done" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Successful",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
        elif operation.Status == "Failed" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
        elif operation.Status in ("Waiting", "Scheduled"):
            # no update for waiting or all files scheduled
            break

    if not self.rmsMonitoring:
        gMonitor.flush()

    if error:
        return S_ERROR(error)

    # request done?
    if self.request.Status == "Done":
        # update request to the RequestDB
        self.log.info("Updating request status:", "%s" % self.request.Status)
        update = self.updateRequest()
        if not update["OK"]:
            self.log.error("Cannot update request status", update["Message"])
            return update
        self.log.info("request is done", "%s" % self.request.RequestName)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord({
                "timestamp": int(Time.toEpoch()),
                "host": Network.getFQDN(),
                "objectType": "Request",
                "objectID": getattr(self.request, "RequestID", 0),
                "status": "Successful",
                "nbObject": 1,
            })
        else:
            gMonitor.addMark("RequestOK", 1)

        # and there is a job waiting for it? finalize!
        if self.request.JobID:
            attempts = 0
            while True:
                finalizeRequest = self.requestClient.finalizeRequest(
                    self.request.RequestID, self.request.JobID  # pylint: disable=no-member
                )
                if not finalizeRequest["OK"]:
                    if not attempts:
                        self.log.error(
                            "unable to finalize request, will retry",
                            "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]),
                        )
                    self.log.debug("Waiting 10 seconds")
                    attempts += 1
                    if attempts == 10:
                        self.log.error("Giving up finalize request")
                        return S_ERROR("Could not finalize request")
                    time.sleep(10)
                else:
                    self.log.info(
                        "request is finalized",
                        "ReqName %s %s"
                        % (self.request.RequestName, (" after %d attempts" % attempts) if attempts else ""),
                    )
                    break

    # Commit all the data to the ES Backend
    if self.rmsMonitoring:
        self.rmsMonitoringReporter.commit()

    # Request will be updated by the callBack method
    self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status)
    return S_OK(self.request)
def finalizeRequest(self, requestID, jobID, useCertificates=True):
    """Check request status and perform finalization if necessary:
    update the request status and the corresponding job parameter.

    :param self: self reference
    :param str requestID: request id
    :param int jobID: job id
    """
    stateServer = JobStateUpdateClient(useCertificates=useCertificates)

    # Checking if to update the job status - we should fail here, so it will be re-tried later
    # Checking the state, first
    res = self.getRequestStatus(requestID)
    if not res["OK"]:
        self.log.error("finalizeRequest: failed to get request",
                       "request: %s status: %s" % (requestID, res["Message"]))
        return res
    if res["Value"] != "Done":
        return S_ERROR("The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                       % (requestID, res["Value"]))

    # The request is 'Done', let's update the job status. If we fail, we should re-try later
    monitorServer = JobMonitoringClient(useCertificates=useCertificates)
    res = monitorServer.getJobSummary(int(jobID))
    if not res["OK"]:
        self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
        return res
    elif not res["Value"]:
        self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
        return S_OK()
    else:
        jobStatus = res["Value"]["Status"]
        jobMinorStatus = res["Value"]["MinorStatus"]

        jobAppStatus = ""
        newJobStatus = ""
        if jobStatus == JobStatus.STALLED:
            # If job is stalled, find the previous status from the logging info
            res = monitorServer.getJobLoggingInfo(int(jobID))
            if not res["OK"]:
                self.log.error("finalizeRequest: Failed to get job logging info", "JobID: %d" % jobID)
                return res
            # Check the last status was Stalled and get the one before
            if len(res["Value"]) >= 2 and res["Value"][-1][0] == JobStatus.STALLED:
                jobStatus, jobMinorStatus, jobAppStatus = res["Value"][-2][:3]
                newJobStatus = jobStatus

        # update the job pending request digest in any case since it is modified
        self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)
        digest = self.getDigest(requestID)
        if digest["OK"]:
            digest = digest["Value"]
            self.log.verbose(digest)
            res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
            if not res["OK"]:
                self.log.info("finalizeRequest: Failed to set job %d parameter: %s" % (jobID, res["Message"]))
                return res
        else:
            self.log.error("finalizeRequest: Failed to get request digest for %s: %s"
                           % (requestID, digest["Message"]))

        if jobStatus == JobStatus.COMPLETED:
            # What to do? Depends on what we have in the minorStatus
            if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
                newJobStatus = JobStatus.DONE
            elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
                newJobStatus = JobStatus.FAILED
            elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
                # If the job has been Killed, set it Killed
                newJobStatus = JobStatus.KILLED
            else:
                self.log.error("finalizeRequest: Unexpected jobMinorStatus",
                               "for %d (got %s)" % (jobID, jobMinorStatus))
                return S_ERROR("Unexpected jobMinorStatus")

        if newJobStatus:
            self.log.info(
                "finalizeRequest: Updating job status",
                "for %d to '%s/%s'" % (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
            )
        else:
            self.log.info(
                "finalizeRequest: Updating job minor status",
                "for %d to '%s' (current status is %s)" % (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
            )
        stateUpdate = stateServer.setJobStatus(jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
        if jobAppStatus and stateUpdate["OK"]:
            stateUpdate = stateServer.setJobApplicationStatus(jobID, jobAppStatus, "RMS")
        if not stateUpdate["OK"]:
            self.log.error(
                "finalizeRequest: Failed to set job status",
                "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
            )
            return stateUpdate

    return S_OK(newJobStatus)
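# Hedged usage sketch for finalizeRequest above, as the RequestTask calls it
# (reqClient stands for a ReqClient instance; requestID and jobID are
# illustrative variables, not values from the snippets).
res = reqClient.finalizeRequest(requestID, jobID)
if res['OK']:
    # 'Value' is the new job status, or an empty string if only the minor
    # status was updated
    print("finalized, new job status: '%s'" % res['Value'])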
jobs = []
for arg in Script.getPositionalArgs():
    try:
        jobs += [int(job) for job in arg.split(',')]
    except ValueError:
        gLogger.fatal("Invalid list of jobIDs")
        DIRAC.exit(2)

from DIRAC.DataManagementSystem.Client.DataManager import DataManager
from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient
from DIRAC.Core.Utilities.SiteSEMapping import getSEsForSite

dm = DataManager()
bk = BookkeepingClient()
monitoring = JobMonitoringClient()

if not jobs:
    conditions = {
        'Status': 'Failed',
        'MinorStatus': 'Maximum of reschedulings reached',
        'ApplicationStatus': 'Failed Input Data Resolution '
    }
    prStr = 'all jobs'
    if production:
        prStr = 'production %s' % ' '.join(production)
        if len(production) == 1:
            production = production[0]
        conditions['JobGroup'] = production
    if userName:
        prStr = 'user %s' % userName
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Now, let's submit some jobs. Different sites, types, inputs"""
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'])
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'])
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(types))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'])
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'])
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'])
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'])
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'])
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])])
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'])
    self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'])
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'])
    try:
        self.assertTrue(
            res['Value'].get('Received') + res['Value'].get('Waiting') >= int(len(dests) * len(lfnss) * len(types)))
    except TypeError:
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'])
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'])

    res = jobStateUpdate.setJobStatusBulk(jobID, {
        str(datetime.datetime.utcnow()): {
            'Status': 'Running',
            'MinorStatus': 'MinorStatus',
            'ApplicationStatus': 'ApplicationStatus',
            'Source': 'Unknown'
        }
    })
    self.assertTrue(res['OK'])
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'])

    # delete the jobs - this will just set their status to "Deleted"
    wmsClient.deleteJob(jobIDs)
def __init__(self, *args, **kwargs):
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'DataRecoveryAgent'
    self.enabled = False

    self.productionsToIgnore = self.am_getOption("ProductionsToIgnore", [])
    self.transformationTypes = self.am_getOption("TransformationTypes",
                                                 ['MCReconstruction',
                                                  'MCSimulation',
                                                  'MCReconstruction_Overlay',
                                                  'MCGeneration'])
    self.transformationStatus = self.am_getOption("TransformationStatus", ['Active', 'Completing'])
    self.shifterProxy = self.am_setOption('shifterProxy', 'DataManager')

    # This needs to be both, otherwise we cannot account for all cases
    self.jobStatus = ['Failed', 'Done']

    self.jobMon = JobMonitoringClient()
    self.fcClient = FileCatalogClient()
    self.tClient = TransformationClient()
    self.reqClient = ReqClient()
    self.diracILC = DiracILC()
    self.inputFilesProcessed = set()
    self.todo = {'MCGeneration':
                 [dict(Message="MCGeneration: OutputExists: Job 'Done'",
                       ShortMessage="MCGeneration: job 'Done' ",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and job.status == 'Failed',
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                       ),
                  dict(Message="MCGeneration: OutputMissing: Job 'Failed'",
                       ShortMessage="MCGeneration: job 'Failed' ",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and job.status == 'Done',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  # dict(Message="MCGeneration, job 'Done': OutputExists: Task 'Done'",
                  #      ShortMessage="MCGeneration: job already 'Done' ",
                  #      Counter=0,
                  #      Check=lambda job: job.allFilesExist() and job.status == 'Done',
                  #      Actions=lambda job, tInfo: [tInfo._TransformationInfo__setTaskStatus(job, 'Done')],
                  #      ),
                  ],
                 'OtherProductions':
                 [  # should always be first!
                  dict(Message="One of many Successful: clean others",
                       ShortMessage="Other Tasks --> Keep",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and job.otherTasks and
                       job.inputFile not in self.inputFilesProcessed,
                       Actions=lambda job, tInfo: [self.inputFilesProcessed.add(job.inputFile),
                                                   job.setJobDone(tInfo),
                                                   job.setInputProcessed(tInfo)],
                       ),
                  dict(Message="Other Task processed Input, no Output: Fail",
                       ShortMessage="Other Tasks --> Fail",
                       Counter=0,
                       Check=lambda job: job.inputFile in self.inputFilesProcessed and
                       job.allFilesMissing() and job.status != 'Failed',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Other Task processed Input: Fail and clean",
                       ShortMessage="Other Tasks --> Cleanup",
                       Counter=0,
                       Check=lambda job: job.inputFile in self.inputFilesProcessed and
                       not job.allFilesMissing(),
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)],
                       ),
                  dict(Message="InputFile missing: mark job 'Failed', mark input 'Deleted', clean",
                       ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
                       Counter=0,
                       Check=lambda job: job.inputFile and not job.inputFileExists and
                       job.fileStatus != "Deleted",
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo),
                                                   job.setJobFailed(tInfo),
                                                   job.setInputDeleted(tInfo)],
                       ),
                  dict(Message="InputFile Deleted, output Exists: mark job 'Failed', clean",
                       ShortMessage="Input Deleted --> Job 'Failed, Cleanup",
                       Counter=0,
                       Check=lambda job: job.inputFile and not job.inputFileExists and
                       job.fileStatus == "Deleted" and not job.allFilesMissing(),
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)],
                       ),
                  # All Output Exists
                  dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
                       ShortMessage="Output Exists --> Job Done, Input Processed",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.fileStatus != "Processed" and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)],
                       ),
                  dict(Message="Output Exists, job Failed, input Processed --> Job Done",
                       ShortMessage="Output Exists --> Job Done",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.fileStatus == "Processed" and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                       ),
                  dict(Message="Output Exists, job Done, input not Processed --> Input Processed",
                       ShortMessage="Output Exists --> Input Processed",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       job.fileStatus != "Processed" and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)],
                       ),
                  # Output missing
                  dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
                       ShortMessage="Max ErrorCount --> Input MaxReset",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.fileStatus in ASSIGNEDSTATES and
                       job.inputFile not in self.inputFilesProcessed and
                       job.inputFileExists and
                       job.errorCount > MAXRESET,
                       Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)],
                       ),
                  dict(Message="Output Missing, job Failed, input Assigned --> Input Unused",
                       ShortMessage="Output Missing --> Input Unused",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.fileStatus in ASSIGNEDSTATES and
                       job.inputFile not in self.inputFilesProcessed and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.setInputUnused(tInfo)],
                       ),
                  dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
                       ShortMessage="Output Missing --> Job Failed, Input Unused",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       job.fileStatus in ASSIGNEDSTATES and
                       job.inputFile not in self.inputFilesProcessed and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)],
                       ),
                  # Some files missing, needing cleanup. Only checking for
                  # assigned, because processed could mean an earlier job was
                  # successful and this one is just the duplicate that needed
                  # to be removed! But we check for other tasks earlier, so
                  # this should not happen.
                  dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
                       ShortMessage="Output Missing --> Cleanup, Input Unused",
                       Counter=0,
                       Check=lambda job: job.someFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.fileStatus in ASSIGNEDSTATES and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)],
                       ),
                  dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
                       ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused",
                       Counter=0,
                       Check=lambda job: job.someFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       job.fileStatus in ASSIGNEDSTATES and
                       job.inputFileExists,
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo),
                                                   job.setInputUnused(tInfo),
                                                   job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Some missing, job Done --> job Failed",
                       ShortMessage="Output Missing, Done --> Job Failed",
                       Counter=0,
                       Check=lambda job: not job.allFilesExist() and job.status == 'Done',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Something Strange",
                       ShortMessage="Strange",
                       Counter=0,
                       Check=lambda job: job.status not in ("Failed", "Done"),
                       Actions=lambda job, tInfo: [],
                       ),
                  # should always be the last one!
                  dict(Message="Failed Hard",
                       ShortMessage="Failed Hard",
                       Counter=0,
                       Check=lambda job: False,  # never
                       Actions=lambda job, tInfo: [],
                       ),
                  ],
                 }
    self.jobCache = defaultdict(lambda: (0, 0))
    self.printEveryNJobs = self.am_getOption('PrintEvery', 200)

    # Notification
    self.notesToSend = ""
    self.addressTo = self.am_getOption('MailTo', ["*****@*****.**"])
    self.addressFrom = self.am_getOption('MailFrom', "*****@*****.**")
    self.subject = "DataRecoveryAgent"
def test_FullChain(self):
    """This test will

    - call all the WMSClient methods, which will end up calling all the
      JobManager service methods
    - use the JobMonitoring to verify a few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(isinstance(res['Value'], int), msg="Got %s" % type(res['Value']))
    self.assertEqual(res['Value'], res['JobID'], msg="Got %s, expected %s" % (str(res['Value']), res['JobID']))
    jobID = res['Value']

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Received', msg="Got %s" % str(res['Value']))
    res = jobMonitor.getJobsMinorStatus([jobID])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {jobID: {'MinorStatus': 'Job Rescheduled', 'JobID': jobID}},
                     msg="Got %s" % str(res['Value']))
    res = jobMonitor.getJobsApplicationStatus([jobID])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {jobID: {'ApplicationStatus': 'Unknown', 'JobID': jobID}},
                     msg="Got %s" % str(res['Value']))

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Killed', msg="Got %s" % str(res['Value']))

    # updating the status aaaagain
    res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Done', msg="Got %s" % str(res['Value']))  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Deleted', msg="Got %s" % str(res['Value']))
def __init__(self, *args, **kwargs):
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'DataRecoveryAgent'
    self.enabled = False
    self.getJobInfoFromJDLOnly = False

    self.__getCSOptions()

    # This needs to be both, otherwise we cannot account for all cases
    self.jobStatus = ['Failed', 'Done']

    self.jobMon = JobMonitoringClient()
    self.fcClient = FileCatalogClient()
    self.tClient = TransformationClient()
    self.reqClient = ReqClient()
    self.diracAPI = Dirac()
    self.inputFilesProcessed = set()
    self.todo = {'NoInputFiles':
                 [dict(Message="NoInputFiles: OutputExists: Job 'Done'",
                       ShortMessage="NoInputFiles: job 'Done' ",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and job.status == 'Failed',
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                       ),
                  dict(Message="NoInputFiles: OutputMissing: Job 'Failed'",
                       ShortMessage="NoInputFiles: job 'Failed' ",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and job.status == 'Done',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  ],
                 'InputFiles':
                 [  # must always be first!
                  dict(Message="One of many Successful: clean others",
                       ShortMessage="Other Tasks --> Keep",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and job.otherTasks and
                       not set(job.inputFiles).issubset(self.inputFilesProcessed),
                       Actions=lambda job, tInfo: [self.inputFilesProcessed.update(job.inputFiles),
                                                   job.setJobDone(tInfo),
                                                   job.setInputProcessed(tInfo)],
                       ),
                  dict(Message="Other Task processed Input, no Output: Fail",
                       ShortMessage="Other Tasks --> Fail",
                       Counter=0,
                       Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and
                       job.allFilesMissing() and job.status != 'Failed',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Other Task processed Input: Fail and clean",
                       ShortMessage="Other Tasks --> Cleanup",
                       Counter=0,
                       Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and
                       not job.allFilesMissing(),
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)],
                       ),
                  dict(Message="InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean",
                       ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
                       Counter=0,
                       Check=lambda job: job.inputFiles and job.allInputFilesMissing() and
                       not job.allTransFilesDeleted(),
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo),
                                                   job.setJobFailed(tInfo),
                                                   job.setInputDeleted(tInfo)],
                       ),
                  dict(Message="InputFile(s) Deleted, output Exists: mark job 'Failed', clean",
                       ShortMessage="Input Deleted --> Job 'Failed, Cleanup",
                       Counter=0,
                       Check=lambda job: job.inputFiles and job.allInputFilesMissing() and
                       job.allTransFilesDeleted() and not job.allFilesMissing(),
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)],
                       ),
                  # All Output Exists
                  dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
                       ShortMessage="Output Exists --> Job Done, Input Processed",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       not job.allFilesProcessed() and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)],
                       ),
                  dict(Message="Output Exists, job Failed, input Processed --> Job Done",
                       ShortMessage="Output Exists --> Job Done",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.allFilesProcessed() and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                       ),
                  dict(Message="Output Exists, job Done, input not Processed --> Input Processed",
                       ShortMessage="Output Exists --> Input Processed",
                       Counter=0,
                       Check=lambda job: job.allFilesExist() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       not job.allFilesProcessed() and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)],
                       ),
                  # Output missing
                  dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
                       ShortMessage="Max ErrorCount --> Input MaxReset",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.allFilesAssigned() and
                       not set(job.inputFiles).issubset(self.inputFilesProcessed) and
                       job.allInputFilesExist() and
                       job.checkErrorCount(),
                       Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)],
                       ),
                  dict(Message="Output Missing, job Failed, input Assigned --> Input Unused",
                       ShortMessage="Output Missing --> Input Unused",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.allFilesAssigned() and
                       not set(job.inputFiles).issubset(self.inputFilesProcessed) and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.setInputUnused(tInfo)],
                       ),
                  dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
                       ShortMessage="Output Missing --> Job Failed, Input Unused",
                       Counter=0,
                       Check=lambda job: job.allFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       job.allFilesAssigned() and
                       not set(job.inputFiles).issubset(self.inputFilesProcessed) and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)],
                       ),
                  # Some files missing, needing cleanup. Only checking for
                  # assigned, because processed could mean an earlier job was
                  # successful and this one is just the duplicate that needed
                  # to be removed! But we check for other tasks earlier, so
                  # this should not happen.
                  dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
                       ShortMessage="Output Missing --> Cleanup, Input Unused",
                       Counter=0,
                       Check=lambda job: job.someFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Failed' and
                       job.allFilesAssigned() and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)],
                       ),
                  dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
                       ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused",
                       Counter=0,
                       Check=lambda job: job.someFilesMissing() and
                       not job.otherTasks and
                       job.status == 'Done' and
                       job.allFilesAssigned() and
                       job.allInputFilesExist(),
                       Actions=lambda job, tInfo: [job.cleanOutputs(tInfo),
                                                   job.setInputUnused(tInfo),
                                                   job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Some missing, job Done --> job Failed",
                       ShortMessage="Output Missing, Done --> Job Failed",
                       Counter=0,
                       Check=lambda job: not job.allFilesExist() and job.status == 'Done',
                       Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                       ),
                  dict(Message="Something Strange",
                       ShortMessage="Strange",
                       Counter=0,
                       Check=lambda job: job.status not in ("Failed", "Done"),
                       Actions=lambda job, tInfo: [],
                       ),
                  # should always be the last one!
                  dict(Message="Failed Hard",
                       ShortMessage="Failed Hard",
                       Counter=0,
                       Check=lambda job: False,  # never
                       Actions=lambda job, tInfo: [],
                       ),
                  ],
                 }
    self.jobCache = defaultdict(lambda: (0, 0))

    # Notification options
    self.notesToSend = ""
    self.subject = "DataRecoveryAgent"
    self.startTime = time.time()
def __sendAccounting(self, jobID):
    """Send WMS accounting data for the given job."""
    try:
        accountingReport = Job()
        endTime = 'Unknown'
        lastHeartBeatTime = 'Unknown'

        result = self.jobDB.getJobAttributes(jobID)
        if not result['OK']:
            return result
        jobDict = result['Value']

        startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        result = JobMonitoringClient().getJobParameter(jobID, 'CPUNormalizationFactor')
        if not result['OK'] or not result['Value']:
            self.log.error('Error getting Job Parameter CPUNormalizationFactor, setting 0',
                           result.get('Message', 'No such value'))
            cpuNormalization = 0.0
        else:
            cpuNormalization = float(result['Value'].get('CPUNormalizationFactor'))

    except Exception as e:
        self.log.exception("Exception in __sendAccounting",
                           "for job=%s: endTime=%s, lastHBTime=%s"
                           % (str(jobID), str(endTime), str(lastHeartBeatTime)),
                           lException=e)
        return S_ERROR("Exception")

    processingType = self.__getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    # Fill the accounting data
    acData = {
        'Site': jobDict['Site'],
        'User': jobDict['Owner'],
        'UserGroup': jobDict['OwnerGroup'],
        'JobGroup': jobDict['JobGroup'],
        'JobType': jobDict['JobType'],
        'JobClass': jobDict['JobSplitType'],
        'ProcessingType': processingType,
        'FinalMajorStatus': 'Failed',
        'FinalMinorStatus': 'Stalled',
        'CPUTime': lastCPUTime,
        'NormCPUTime': lastCPUTime * cpuNormalization,
        'ExecTime': lastWallTime,
        'InputDataSize': 0.0,
        'OutputDataSize': 0.0,
        'InputDataFiles': 0,
        'OutputDataFiles': 0,
        'DiskSpace': 0.0,
        'InputSandBoxSize': 0.0,
        'OutputSandBoxSize': 0.0,
        'ProcessedEvents': 0
    }

    # For accidentally stopped jobs ExecTime can be not set
    if not acData['ExecTime']:
        acData['ExecTime'] = acData['CPUTime']
    elif acData['ExecTime'] < acData['CPUTime']:
        acData['ExecTime'] = acData['CPUTime']

    self.log.verbose('Accounting Report is:')
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    result = accountingReport.commit()
    if result['OK']:
        self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
    else:
        self.log.error('Failed to send accounting report',
                       'Job: %d, Error: %s' % (int(jobID), result['Message']))
    return result
import time

__RCSID__ = "$Id$"

from DIRAC.Core.Base import Script
Script.parseCommandLine()

from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient  # sut
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
from DIRAC.tests.Integration.WorkloadManagementSystem.Test_Client_WMS import helloWorldJob, createFile

jobMonitoringClient = JobMonitoringClient()
jobStateUpdateClient = JobStateUpdateClient()


def createJob():
    job = helloWorldJob()
    jobDescription = createFile(job)

    wmsClient = WMSClient()
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    assert res['OK'], res['Message']
    jobID = int(res['Value'])
    return jobID
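# A short usage sketch for the createJob helper above: submit a hello-world
# job and poll its status through the module-level jobMonitoringClient. Meant
# for a test DIRAC installation only; the status actually reached depends on it.
jobID = createJob()
res = jobMonitoringClient.getJobsStatus([jobID])
assert res['OK'], res.get('Message')
print("job %d is currently '%s'" % (jobID, res['Value'][jobID]['Status']))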
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Now, let's submit some jobs. Different sites, types, inputs"""
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'}, msg="Got %s" % res['Value'])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value'])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % sorted(str(res['Value'])))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'Job Rescheduled'])])
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
    except TypeError:
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobStateUpdate.setJobStatusBulk(jobID, {
        str(datetime.datetime.utcnow()): {
            'Status': 'Running',
            'MinorStatus': 'MinorStatus',
            'ApplicationStatus': 'ApplicationStatus',
            'Source': 'Unknown'
        }
    })
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set their status to "Deleted"
    wmsClient.deleteJob(jobIDs)
def _sendAccounting(self, jobID):
    """Send WMS accounting data for the given job. Run inside thread."""
    try:
        accountingReport = Job()
        endTime = "Unknown"
        lastHeartBeatTime = "Unknown"

        result = self.jobDB.getJobAttributes(jobID)
        if not result["OK"]:
            return result
        jobDict = result["Value"]

        startTime, endTime = self._checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self._checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        result = JobMonitoringClient().getJobParameter(jobID, "CPUNormalizationFactor")
        if not result["OK"] or not result["Value"]:
            self.log.error(
                "Error getting Job Parameter CPUNormalizationFactor, setting 0",
                result.get("Message", "No such value"),
            )
            cpuNormalization = 0.0
        else:
            cpuNormalization = float(result["Value"].get("CPUNormalizationFactor"))

    except Exception as e:
        self.log.exception(
            "Exception in _sendAccounting",
            "for job=%s: endTime=%s, lastHBTime=%s" % (str(jobID), str(endTime), str(lastHeartBeatTime)),
            lException=e,
        )
        return S_ERROR("Exception")

    processingType = self._getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    # Fill the accounting data
    acData = {
        "Site": jobDict["Site"],
        "User": jobDict["Owner"],
        "UserGroup": jobDict["OwnerGroup"],
        "JobGroup": jobDict["JobGroup"],
        "JobType": jobDict["JobType"],
        "JobClass": jobDict["JobSplitType"],
        "ProcessingType": processingType,
        "FinalMajorStatus": JobStatus.FAILED,
        "FinalMinorStatus": JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        "CPUTime": lastCPUTime,
        "NormCPUTime": lastCPUTime * cpuNormalization,
        "ExecTime": lastWallTime,
        "InputDataSize": 0.0,
        "OutputDataSize": 0.0,
        "InputDataFiles": 0,
        "OutputDataFiles": 0,
        "DiskSpace": 0.0,
        "InputSandBoxSize": 0.0,
        "OutputSandBoxSize": 0.0,
        "ProcessedEvents": 0,
    }

    # For accidentally stopped jobs ExecTime can be not set
    if not acData["ExecTime"]:
        acData["ExecTime"] = acData["CPUTime"]
    elif acData["ExecTime"] < acData["CPUTime"]:
        acData["ExecTime"] = acData["CPUTime"]

    self.log.verbose("Accounting Report is:")
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    result = accountingReport.commit()
    if result["OK"]:
        self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
    else:
        self.log.error("Failed to send accounting report",
                       "Job: %d, Error: %s" % (int(jobID), result["Message"]))
    return result
def test_JobStateUpdateAndJobMonitoring(self):
    """ Verifying all JobStateUpdate and JobMonitoring functions
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job, checking a few things
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    # The status set here must match the 'Running' expected further down
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'running', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'),
                                                  ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobMonitor.traceJobParameter('Site', 1, 'Status')
    # self.assertTrue(res['OK'], res.get('Message'))

    # now checking a few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "Deleted"
    wmsClient.deleteJob(jobID)
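# A hedged convenience sketch built only on the getJobStatus call exercised in
# the tests above: poll until a job reaches one of the expected states, or give
# up after a timeout. The helper name and defaults are illustrative, not part
# of the DIRAC API.
import time

def waitForStatus(jobMonitor, jobID, targetStatuses, timeout=60, interval=5):
    """Poll JobMonitoringClient.getJobStatus until jobID reaches a target status.

    Returns the matched status string, or None on timeout.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        res = jobMonitor.getJobStatus(jobID)
        if res['OK'] and res['Value'] in targetStatuses:
            return res['Value']
        time.sleep(interval)
    return None

# e.g. waitForStatus(JobMonitoringClient(), jobID, {'Done', 'Failed'})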
def __init__(self):
    """ Set up the WMS submission and job monitoring clients used by this task handler. """
    TaskBase.__init__(self)
    self.submissionClient = WMSClient()
    self.jobMonitoringClient = JobMonitoringClient()
def test_FullChain(self):
    """ This test will
        - call all the WMSClient methods, which in turn call all the
          JobManager service methods
        - use JobMonitoring to verify a few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    # self.assertEqual(type(res['Value']), int)
    # self.assertEqual(res['Value'], res['JobID'])
    # jobID = res['JobID']
    jobID = res['Value']

    # updating the status
    jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Received')

    # updating the status again
    jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Killed')

    # updating the status yet again
    jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

    # try to kill the job once more
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Done')  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "Deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Deleted')
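# The full-chain test above walks the job state machine. As a reading aid, the
# observations it asserts can be summarized as (action, status-seen-afterwards)
# pairs; this is a hypothetical summary table, not a runnable test driver,
# since the test interleaves explicit setJobStatus calls between actions.
OBSERVED_TRANSITIONS = [
    ('rescheduleJob', 'Received'),  # reschedule puts the job back to Received
    ('killJob', 'Killed'),          # a Matched job can be killed
    ('killJob', 'Done'),            # killing an already-Done job is a no-op
    ('deleteJob', 'Deleted'),       # delete only flips the status flag
]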
def getProductionApplicationSummary(self, productionID, status=None,
                                    minorStatus=None, printOutput=False):
    """Returns an application status summary for the productions in the system.
    If printOutput is specified, the result is printed to the screen. This
    queries the WMS for the given productionID and provides an up-to-date
    snapshot of the application status combinations and associated WMS JobIDs.
    """
    if not isinstance(productionID, (int, str)):
        return self._errorReport('Expected string or int for production ID')

    statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
    if not statusDict['OK']:
        self.log.warn('Could not get production metadata information')
        return statusDict

    jobIDs = list(statusDict['Value'])
    if not jobIDs:
        return S_ERROR('No JobIDs with matching conditions found')

    self.log.verbose('Considering %s jobs with selected conditions' % (len(jobIDs)))
    # now need to get the application status information
    result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
    if not result['OK']:
        self.log.warn('Could not get application status for jobs list')
        return result
    appStatus = result['Value']

    # Now format the result: summary[Status][MinorStatus][ApplicationStatus]
    # holds a total count and the list of matching JobIDs
    summary = {}
    submittedJobs = 0
    doneJobs = 0
    for job, atts in statusDict['Value'].items():
        uniqueStatus = atts['Status'].capitalize()
        minorStat = atts['MinorStatus']
        appStat = appStatus[job]['ApplicationStatus']
        jobInfo = summary.setdefault(uniqueStatus, {}).setdefault(
            minorStat, {}).setdefault(appStat, {'Total': 0, 'JobList': []})
        jobInfo['Total'] += 1
        jobInfo['JobList'].append(job)
        submittedJobs += 1
        if uniqueStatus == 'Done':
            doneJobs += 1

    if not printOutput:
        result = S_OK()
        if not status and not minorStatus:
            result['Totals'] = {'Submitted': int(submittedJobs), 'Done': int(doneJobs)}
        result['Value'] = summary
        return result

    # If a printed summary is requested
    statAdj = int(0.5 * self.prodAdj)
    mStatAdj = int(2.0 * self.prodAdj)
    totalAdj = int(0.5 * self.prodAdj)
    exAdj = int(0.5 * self.prodAdj)
    message = '\nJob Summary for ProductionID %s considering status %s' % (productionID, status)
    if minorStatus:
        message += ' and MinorStatus = %s' % (minorStatus)
    message += ':\n\n'
    message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + \
               'ApplicationStatus'.ljust(mStatAdj) + 'Total'.ljust(totalAdj) + \
               'Example'.ljust(exAdj) + '\n'
    for stat, metadata in summary.items():
        message += '\n'
        for minor, appInfo in metadata.items():
            message += '\n'
            for appStat, jobInfo in appInfo.items():
                message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + \
                           appStat.ljust(mStatAdj) + str(jobInfo['Total']).ljust(totalAdj) + \
                           str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'
    print(message)

    if status or minorStatus:
        return S_OK(summary)

    result = self.getProductionProgress(productionID)
    if not result['OK']:
        self.log.warn('Could not get production progress information')
        return result

    if 'Created' in result['Value']:
        createdJobs = int(result['Value']['Created']) + submittedJobs
    else:
        createdJobs = submittedJobs

    percSub = int(100 * submittedJobs / createdJobs)
    percDone = int(100 * doneJobs / createdJobs)
    print('\nCurrent status of production %s:\n' % productionID)
    print('Submitted'.ljust(12) + str(percSub).ljust(3) + '% ( ' + str(submittedJobs).ljust(7) +
          'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
    print('Done'.ljust(12) + str(percDone).ljust(3) + '% ( ' + str(doneJobs).ljust(7) +
          'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
    result = S_OK()
    result['Totals'] = {'Submitted': int(submittedJobs),
                        'Created': int(createdJobs),
                        'Done': int(doneJobs)}
    result['Value'] = summary
    return result
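# A small sketch of how a caller might walk the nested summary structure that
# getProductionApplicationSummary returns, i.e. {Status: {MinorStatus:
# {ApplicationStatus: {'Total': n, 'JobList': [...]}}}}. The flattenSummary
# helper is hypothetical, shown only to make the dict shape concrete.
def flattenSummary(summary):
    """Yield (status, minorStatus, appStatus, total, exampleJobID) rows."""
    rows = []
    for status, minors in summary.items():
        for minor, apps in minors.items():
            for appStatus, info in apps.items():
                rows.append((status, minor, appStatus,
                             info['Total'], info['JobList'][0]))
    return rows

# e.g.:
# res = diracProd.getProductionApplicationSummary(productionID)
# if res['OK']:
#     for row in flattenSummary(res['Value']):
#         print(row)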