def initialize(self):
    """Load the agent options from the configuration and create the service clients.

    Each option defaults to the value currently held by the matching attribute.

    :param self: self reference
    :return: S_OK()
    """
    # Shifter proxy. As noted for the cleanContent method, this proxy is used even when
    # the file catalog is the DIRAC File Catalog (DFC); that works because the
    # "UseServerCertificate" option is unset.
    self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

    # Transformation types: an explicit agent option takes precedence over the
    # data-processing plus data-manipulation types read from Operations.
    self.dataProcTTypes = Operations().getValue("Transformations/DataProcessing", self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue("Transformations/DataManipulation", self.dataManipTTypes)
    configuredTypes = self.am_getOption("TransformationTypes", [])
    self.transformationTypes = sorted(configuredTypes or (self.dataProcTTypes + self.dataManipTTypes))
    typesText = str(self.transformationTypes)
    self.log.info("Will consider the following transformation types: %s" % typesText)

    # Locations searched for the transformation directories
    locations = self.am_getOption("DirectoryLocations", self.directoryLocations)
    self.directoryLocations = sorted(locations)
    locationsText = str(self.directoryLocations)
    self.log.info("Will search for directories in the following locations: %s" % locationsText)

    # Name of the metadata tag that holds the transformation ID
    self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # Archive period, in days
    self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter)
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # Storage element holding the transformation logs
    self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # Clients: transformation, WMS, request, file catalog, job monitoring
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()
    self.jobMonitoringClient = JobMonitoringClient()

    return S_OK()
def checkJobStateTransition(jobID, candidateState, currentStatus=None, jobMonitoringClient=None):
    """Check whether a job is allowed to move to the candidate state.

    :param jobID: ID of the job
    :param candidateState: state the job is asked to move to
    :param currentStatus: present status of the job; looked up through the
                          job monitoring client when not given
    :param jobMonitoringClient: client used for the lookup; created when not given
    :return: S_OK() when the transition is accepted, otherwise S_ERROR or the
             failed result of the underlying call
    """
    if not currentStatus:
        if not jobMonitoringClient:
            # imported lazily, only when no client was supplied
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

            jobMonitoringClient = JobMonitoringClient()

        statusResult = jobMonitoringClient.getJobsStatus(jobID)
        if not statusResult["OK"]:
            return statusResult
        try:
            currentStatus = statusResult["Value"][jobID]["Status"]
        except KeyError:
            return S_ERROR("Job does not exist")

    nextState = JobsStateMachine(currentStatus).getNextState(candidateState)
    if not nextState["OK"]:
        return nextState

    # The state machine answers with the state it accepts; anything other than the
    # candidate means the transition was refused.
    if candidateState != nextState["Value"]:
        refusal = "%s can't move from %s to %s" % (jobID, currentStatus, candidateState)
        gLogger.error("Job Status Error", refusal)
        return S_ERROR("Job state transition not allowed")

    return S_OK()
def __init__(self, *args, **kwargs):
    """Create the agent with its default settings and service clients.

    Every switch (enabled, restart*, controlComponents, commitURLs) starts as False.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # identity
    self.name = "ComponentSupervisionAgent"
    self.setup = "DIRAC-Production"
    self.diracLocation = rootPath

    # switches
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.doNotRestartInstancePattern = ["RequestExecutingAgent"]

    # clients; the system administrator client targets this host
    self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # component bookkeeping
    self.agents = {}
    self.executors = {}
    self.services = {}
    self._tornadoPort = "8443"
    self.errors = []
    self.accounting = defaultdict(dict)

    # notification e-mail settings
    self.addressTo = []
    self.addressFrom = ""
    self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()
def test_ParametricChain(self):
    """Submit a parametric job, which should generate 3 actual jobs, then delete them."""
    submitter = WMSClient()
    stateUpdater = JobStateUpdateClient()
    monitor = JobMonitoringClient()

    # build the job and its description file, then submit
    job = parametricJob()
    xmlDescription = createFile(job)
    submission = submitter.submitJob(job._toJDL(xmlFile=xmlDescription))
    self.assertTrue(submission['OK'])
    jobIDList = submission['Value']
    self.assertEqual(len(jobIDList), 3)

    # each generated job carries its own numbered name
    parameters = monitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(parameters['OK'])
    foundNames = set(jobParams['JobName'] for jobParams in parameters['Value'].values())
    expectedNames = set('parametric_helloWorld_%s' % nJob for nJob in range(3))
    self.assertEqual(foundNames, expectedNames)

    for jobID in jobIDList:
        update = stateUpdater.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(update['OK'])

    deletion = submitter.deleteJob(jobIDList)
    self.assertTrue(deletion['OK'])

    for jobID in jobIDList:
        status = monitor.getJobStatus(jobID)
        self.assertTrue(status['OK'])
        self.assertEqual(status['Value'], 'Deleted')
def test_ParametricChain(self):
    """Submit a parametric job expected to generate 3 actual jobs, then delete them.

    The job state is updated through an RPC client to the JobStateUpdate service.
    """
    wms = WMSClient()
    stateUpdateRPC = RPCClient('WorkloadManagement/JobStateUpdate')
    jobMon = JobMonitoringClient()

    # create and submit the job
    job = parametricJob()
    descriptionFile = createFile(job)
    jdl = job._toJDL(xmlFile=descriptionFile)
    submitted = wms.submitJob(jdl)
    self.assertTrue(submitted['OK'])
    jobIDList = submitted['Value']
    self.assertEqual(len(jobIDList), 3)

    # the generated jobs are named after their parameter index
    namesResult = jobMon.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(namesResult['OK'])
    namesByJob = namesResult['Value']
    jobNames = set()
    for jobID in namesByJob:
        jobNames.add(namesByJob[jobID]['JobName'])
    self.assertEqual(jobNames, set(['parametric_helloWorld_%s' % index for index in range(3)]))

    for jobID in jobIDList:
        updated = stateUpdateRPC.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(updated['OK'])

    deleted = wms.deleteJob(jobIDList)
    self.assertTrue(deleted['OK'])

    for jobID in jobIDList:
        current = jobMon.getJobStatus(jobID)
        self.assertTrue(current['OK'])
        self.assertEqual(current['Value'], 'Deleted')
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
             ownerDN=None, ownerGroup=None):
    """Set up the task manager, creating a default for every collaborator not passed in.

    jobClass defaults to "DIRAC.Interfaces.API.Job.Job". An extension of it also
    works: VOs can pass in their job class extension, if present.
    """
    super(WorkflowTasks, self).__init__(transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    # the default submission client uses certificates only when both owner DN and group are given
    useCertificates = bool(ownerDN and ownerGroup)
    self.submissionClient = submissionClient or WMSClient(useCertificates=useCertificates,
                                                          delegatedDN=ownerDN,
                                                          delegatedGroup=ownerGroup)
    self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
    self.jobClass = jobClass or Job

    # the Operations helper must be set first: the two defaults below are read from it
    self.opsH = opsH or Operations()
    self.outputDataModule = outputDataModule or self.opsH.getValue("Transformations/OutputDataModule", "")
    self.destinationPlugin = destinationPlugin or self.opsH.getValue('Transformations/DestinationPlugin',
                                                                     'BySE')

    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def initialize(self):
    """Set the polling time and create the clients the agent works with.

    :return: S_OK()
    """
    pollingTime = 60
    self.am_setOption("PollingTime", pollingTime)

    # overlay system and job monitoring clients
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()

    return S_OK()
def execute(self):
    """Download every file listed in ``self.files`` from its storage element.

    The method first waits until fewer than ``/GetSRM/MaxConcurrentRunning`` jobs
    (default 100) report the application status 'Downloading SRM files', polling
    once a minute, and gives up after more than 10 failed polls. Each file is then
    fetched into a fresh ``InputData_<counter>`` directory created under the current
    working directory, with one retry per file.

    :return: S_OK() on success (also when the workflow or step status is already
             not OK), S_ERROR or the failed result otherwise
    """
    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
        LOG.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'],
                                                                self.stepStatus['OK']))
        return S_OK('Workflow status is not OK')

    result = self.resolveInputVariables()
    if not result['OK']:
        LOG.error("Failed to resolve input parameters:", result["Message"])
        return result

    if not self.srmfiles:
        LOG.error('Files txt was not found correctly: %s' % self.srmfiles)
        return S_ERROR('Files txt was not found correctly')

    # An empty list is rejected here as well; indexing it would raise IndexError.
    if not self.files or not isinstance(self.files[0], dict):
        LOG.error('Files were not found correctly: %s' % self.files)
        return S_ERROR('Files were not found correctly')

    # Check that not too many concurrent jobs are downloading at the same time
    max_concurrent_running = self.ops.getValue('/GetSRM/MaxConcurrentRunning', 100)
    jobMonitor = JobMonitoringClient()  # one client serves all the polls
    error_count = 0
    while True:
        if error_count > 10:
            LOG.error('JobDB Content does not return expected dictionary')
            return S_ERROR('Failed to get number of concurrent overlay jobs')
        res = jobMonitor.getCurrentJobCounters({'ApplicationStatus': 'Downloading SRM files'})
        if not res['OK']:
            error_count += 1
            time.sleep(60)
            continue
        running = res['Value'].get('Running', 0)
        if running < max_concurrent_running:
            break
        time.sleep(60)

    self.setApplicationStatus('Downloading SRM files')
    for filed in self.files:
        if 'file' not in filed or 'site' not in filed:
            LOG.error('Dictionnary does not contain correct keys')
            return S_ERROR('Dictionnary does not contain correct keys')

        start = os.getcwd()
        downloadDir = tempfile.mkdtemp(prefix='InputData_%s' % (self.counter), dir=start)
        os.chdir(downloadDir)
        try:
            storageElement = StorageElement(filed['site'])
            result = storageElement.getFile(filed['file'])
            # Retry once, both when the call itself failed and when the file is reported as failed
            if not result['OK'] or result['Value']['Failed']:
                result = storageElement.getFile(filed['file'])
        finally:
            # always go back, even if the storage element raised
            os.chdir(start)

        if not result['OK']:
            # a failed call carries no 'Value', so it is reported through its 'Message'
            LOG.error("Failed to get the file from storage:", result['Message'])
            return result
        if result['Value']['Failed']:
            LOG.error("Failed to get the file from storage:", result['Value']['Failed'])
            # NOTE(review): result['OK'] is still True here, so a caller that only
            # tests 'OK' sees a success - confirm this is intended.
            return result
        self.counter += 1

    return S_OK()
def deleteJobOversizedSandbox(self, jobIDList):
    """Schedule the removal of the oversized output sandboxes of the given jobs.

    The sandbox LFN of each job is read from its 'OutputSandboxLFN' parameter and a
    removal request is set on behalf of the job owner. Jobs without that parameter
    are left out of both dictionaries.

    :param list jobIDList: list of job IDs
    :return: S_OK({'Successful': {jobID: lfn}, 'Failed': {jobID: lfn}}), or the
             failed result of the parameter query
    """
    failed = {}
    successful = {}

    result = JobMonitoringClient().getJobParameters(jobIDList, 'OutputSandboxLFN')
    if not result['OK']:
        return result
    osLFNList = result['Value']
    if not osLFNList:
        return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now.
    # items() rather than iteritems(), which exists on Python 2 only.
    for jobID, outputSandboxLFNdict in osLFNList.items():
        lfn = outputSandboxLFNdict.get('OutputSandboxLFN') if outputSandboxLFNdict else None
        if not lfn:
            # no oversized sandbox recorded for this job: nothing to remove
            continue

        result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not result['OK'] or not result['Value']:
            failed[jobID] = lfn
            continue

        ownerDN = result['Value']['OwnerDN']
        ownerGroup = result['Value']['OwnerGroup']
        result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if not result['OK']:
            failed[jobID] = lfn
        else:
            successful[jobID] = lfn

    return S_OK({'Successful': successful, 'Failed': failed})
def __init__(self, *args, **kwargs):
    """Create the agent with its default settings and service clients.

    Every switch (enabled, restart*, controlComponents, commitURLs) starts as False.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # identity and installation
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.diracLocation = "/opt/dirac/pro"

    # switches
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False

    # clients; the system administrator client targets this host
    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # component bookkeeping
    self.agents = {}
    self.executors = {}
    self.services = {}
    self.errors = []
    self.accounting = defaultdict(dict)

    # notification e-mail settings
    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()
def initialize(self):
    """Configure the polling time and instantiate the clients used by the agent.

    :return: S_OK()
    """
    optionName, optionValue = "PollingTime", 60
    self.am_setOption(optionName, optionValue)

    # clients for the overlay system and for job monitoring
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()

    return S_OK()
def test_ParametricChain(self):
    """Submit a parametric job, which should generate 3 actual jobs, then delete them."""
    submitter = WMSClient()
    stateUpdater = JobStateUpdateClient()
    monitor = JobMonitoringClient()

    # build the job and its description file, then submit
    job = parametricJob()
    xmlDescription = createFile(job)
    jdl = job._toJDL(xmlFile=xmlDescription)
    res = submitter.submitJob(jdl)
    self.assertTrue(res["OK"], res.get("Message"))
    jobIDList = res["Value"]
    self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

    # each generated job carries its own numbered name
    res = monitor.getJobsParameters(jobIDList, ["JobName"])
    self.assertTrue(res["OK"], res.get("Message"))
    foundNames = set(jobParams["JobName"] for jobParams in res["Value"].values())
    expectedNames = set("parametric_helloWorld_%s" % nJob for nJob in range(3))
    self.assertEqual(foundNames, expectedNames)

    for jobID in jobIDList:
        res = stateUpdater.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(res["OK"], res.get("Message"))

    res = submitter.deleteJob(jobIDList)
    self.assertTrue(res["OK"], res.get("Message"))
    print(res)

    for jobID in jobIDList:
        res = monitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        reportedStatus = res["Value"][jobID]["Status"]
        self.assertEqual(reportedStatus, JobStatus.DELETED, msg="Got %s" % str(res["Value"]))
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
             ownerDN=None, ownerGroup=None):
    """Build the task manager, falling back to a default for each object not supplied.

    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also
    works: VOs can pass in their job class extension, if present.
    """
    logger = logger if logger else gLogger.getSubLogger('WorkflowTasks')
    super(WorkflowTasks, self).__init__(transClient, logger)

    # certificates are requested only when both the owner DN and the owner group are given
    useCertificates = bool(ownerDN) and bool(ownerGroup)

    self.submissionClient = (
        submissionClient
        if submissionClient
        else WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup)
    )
    self.jobMonitoringClient = jobMonitoringClient if jobMonitoringClient else JobMonitoringClient()
    self.jobClass = jobClass if jobClass else Job
    self.opsH = opsH if opsH else Operations()

    # both defaults below come from the Operations helper set just above
    self.outputDataModule = (
        outputDataModule
        if outputDataModule
        else self.opsH.getValue("Transformations/OutputDataModule", "")
    )
    self.destinationPlugin = (
        destinationPlugin
        if destinationPlugin
        else self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    )

    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Build the task manager, creating a default for each object that is not supplied.

    The modules providing the defaults are imported only when a default is needed.
    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also
    works: VOs can pass in their job class extension, if present.
    """
    if not logger:
        logger = gLogger.getSubLogger('WorkflowTasks')
    super(WorkflowTasks, self).__init__(transClient, logger)

    if submissionClient:
        self.submissionClient = submissionClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
        self.submissionClient = WMSClient()

    if jobMonitoringClient:
        self.jobMonitoringClient = jobMonitoringClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
        self.jobMonitoringClient = JobMonitoringClient()

    # the default output data module is read from the VO policy in the configuration
    if outputDataModule:
        self.outputDataModule = outputDataModule
    else:
        self.outputDataModule = gConfig.getValue("/DIRAC/VOPolicy/OutputDataModule", "")

    if jobClass:
        self.jobClass = jobClass
    else:
        from DIRAC.Interfaces.API.Job import Job
        self.jobClass = Job

    if opsH:
        self.opsH = opsH
    else:
        from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
        self.opsH = Operations()
def deleteJobOversizedSandbox(self, jobIDList):
    """Schedule the removal of the oversized output sandboxes of the given jobs.

    The sandbox LFN of each job is read from its 'OutputSandboxLFN' parameter. Jobs
    without one are reported as successful with an explanatory message; jobs whose
    parameter could not be read are only logged.

    :param list jobIDList: list of job IDs
    :return: S_OK({'Successful': {jobID: lfn or message}, 'Failed': {jobID: lfn}})
    """
    failed = {}
    successful = {}

    # One client serves every lookup instead of building a new one per job.
    jobMonitoringClient = JobMonitoringClient()

    lfnDict = {}
    for jobID in jobIDList:
        result = jobMonitoringClient.getJobParameter(jobID, 'OutputSandboxLFN')
        if not result['OK']:
            gLogger.error('Error interrogating JobDB: %s' % result['Message'])
            continue
        lfn = result['Value'].get('OutputSandboxLFN')
        if lfn:
            lfnDict[lfn] = jobID
        else:
            successful[jobID] = 'No oversized sandbox found'

    if not lfnDict:
        return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now, on behalf of each job owner
    for lfn, jobID in lfnDict.items():
        result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not result['OK'] or not result['Value']:
            failed[jobID] = lfn
            continue

        ownerDN = result['Value']['OwnerDN']
        ownerGroup = result['Value']['OwnerGroup']
        result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if not result['OK']:
            failed[jobID] = lfn
        else:
            successful[jobID] = lfn

    return S_OK({'Successful': successful, 'Failed': failed})
def deleteJobOversizedSandbox(self, jobIDList):
    """Delete the oversized sandbox files of the given jobs from storage elements.

    A removal request is set for each sandbox LFN on behalf of the job owner.

    :param list jobIDList: list of job IDs
    :returns: S_OK with the "Successful" and "Failed" dictionaries (job ID -> LFN),
              or the failed result of the parameter query
    """
    failed = {}
    successful = {}
    outcome = {"Successful": successful, "Failed": failed}

    paramResult = JobMonitoringClient().getJobParameters(jobIDList, ["OutputSandboxLFN"])
    if not paramResult["OK"]:
        return paramResult
    if not paramResult["Value"]:
        return S_OK(outcome)

    # keep only the jobs for which parameters were returned
    osLFNDict = {jobID: params for jobID, params in paramResult["Value"].items() if params}
    self.log.verbose("Deleting oversized sandboxes", osLFNDict)

    # Schedule removal of the LFNs now
    for jobID, params in osLFNDict.items():
        lfn = params["OutputSandboxLFN"]

        ownerResult = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not ownerResult["OK"] or not ownerResult["Value"]:
            failed[jobID] = lfn
            continue

        owner = ownerResult["Value"]
        ownerDN = owner["OwnerDN"]
        ownerGroup = owner["OwnerGroup"]
        removal = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
        if removal["OK"]:
            successful[jobID] = lfn
        else:
            failed[jobID] = lfn

    return S_OK(outcome)
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Build the task manager, creating a default for each object that is not supplied.

    jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also
    works: VOs can pass in their job class extension, if present.
    """
    if not logger:
        logger = gLogger.getSubLogger('WorkflowTasks')
    super(WorkflowTasks, self).__init__(transClient, logger)

    # fill in the defaults first, then store everything on the instance
    if not submissionClient:
        submissionClient = WMSClient()
    if not jobMonitoringClient:
        jobMonitoringClient = JobMonitoringClient()
    if not jobClass:
        jobClass = Job
    if not opsH:
        opsH = Operations()
    if not outputDataModule:
        # the default output data module is read from the Operations helper
        outputDataModule = opsH.getValue("Transformations/OutputDataModule", "")

    self.submissionClient = submissionClient
    self.jobMonitoringClient = jobMonitoringClient
    self.jobClass = jobClass
    self.opsH = opsH
    self.outputDataModule = outputDataModule
class ResetCounters(AgentModule):
    """Reset the number of jobs at all sites.

    Some sites are not updated properly, so once in a while the correct number of
    jobs has to be restored. It does not need to be exact, but enough to clear
    some of the jobs.
    """

    def initialize(self):
        """Set the polling time and create the overlay and job monitoring clients."""
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """Recount the running overlay jobs per site; called by the Agent Reactor.

        Sites whose counters cannot be obtained are left out of the update.

        :return: S_OK(), or the failed result of the overlay system calls
        """
        sitesResult = self.ovc.getSites()
        if not sitesResult['OK']:
            return sitesResult
        sites = sitesResult['Value']
        gLogger.info("Will update info for sites %s" % sites)

        runningPerSite = {}
        for site in sites:
            selection = {"Site": site, "ApplicationStatus": 'Getting overlay files'}
            counters = self.jobmon.getCurrentJobCounters(selection)
            if counters['OK']:
                # a site with no 'Running' entry counts as zero
                runningPerSite[site] = counters['Value'].get('Running', 0)

        gLogger.info("Setting new values %s" % runningPerSite)
        update = self.ovc.setJobsAtSites(runningPerSite)
        if update['OK']:
            return S_OK()
        gLogger.error(update['Message'])
        return update
def __getJobPilotStatus(self, jobID):
    """Return the status of the pilot referenced by the given job.

    :param jobID: job ID
    :return: S_OK(pilot status), S_OK('NoPilot') when the job has no pilot reference
             or the pilot is not found, otherwise the failed result
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
        return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    pilotResult = PilotManagerClient().getPilotInfo(pilotReference)
    if pilotResult['OK']:
        return S_OK(pilotResult['Value'][pilotReference]['Status'])

    # a pilot that does not exist is reported as 'NoPilot' rather than as a failure
    if DErrno.cmpError(pilotResult, DErrno.EWMSNOPILOT):
        self.log.warn("No pilot found", "for job %d: %s" % (jobID, pilotResult['Message']))
        return S_OK('NoPilot')

    self.log.error('Failed to get pilot information',
                   'for job %d: %s' % (jobID, pilotResult['Message']))
    return pilotResult
class ResetCounters(AgentModule):
    """Reset the number of jobs at all sites.

    Some sites are not updated properly, so once in a while it's needed to restore
    the correct number of jobs. It does not need to be exact, but enough to clear
    some of the jobs.
    """

    def initialize(self):
        """Initialize the agent: set the polling time and create the clients.

        :return: S_OK()
        """
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """Recount the running overlay jobs of every site; called by the Agent Reactor.

        Sites whose job counters cannot be obtained are left out of the update.

        :return: S_OK(), or the failed result of the overlay system calls
        """
        res = self.ovc.getSites()
        if not res['OK']:
            return res
        sitedict = {}
        sites = res['Value']
        gLogger.info("Will update info for sites %s" % sites)
        for site in sites:
            attribdict = {"Site": site, "ApplicationStatus": 'Getting overlay files'}
            res = self.jobmon.getCurrentJobCounters(attribdict)
            if not res['OK']:
                continue
            # 'in' rather than dict.has_key(), which exists on Python 2 only
            if 'Running' in res['Value']:
                sitedict[site] = res['Value']['Running']
            else:
                sitedict[site] = 0
        gLogger.info("Setting new values %s" % sitedict)
        res = self.ovc.setJobsAtSites(sitedict)
        if not res['OK']:
            gLogger.error(res['Message'])
            return res

        return S_OK()
def _getJobPilotStatus(self, jobID):
    """Look up the status of the pilot referenced by the given job.

    :param jobID: job ID
    :return: S_OK(pilot status), S_OK("NoPilot") when the job has no pilot reference
             or the pilot is not found, otherwise the failed result
    """
    reference = JobMonitoringClient().getJobParameter(jobID, "Pilot_Reference")
    if not reference["OK"]:
        return reference

    pilotReference = reference["Value"].get("Pilot_Reference", "Unknown")
    if pilotReference == "Unknown":
        # There is no pilot reference, hence its status is unknown
        return S_OK("NoPilot")

    pilotInfo = PilotManagerClient().getPilotInfo(pilotReference)
    if not pilotInfo["OK"]:
        pilotMissing = DErrno.cmpError(pilotInfo, DErrno.EWMSNOPILOT)
        if not pilotMissing:
            self.log.error(
                "Failed to get pilot information",
                "for job %d: %s" % (jobID, pilotInfo["Message"]),
            )
            return pilotInfo
        # a pilot that does not exist is reported as "NoPilot" rather than as a failure
        self.log.warn("No pilot found", "for job %d: %s" % (jobID, pilotInfo["Message"]))
        return S_OK("NoPilot")

    statusOfPilot = pilotInfo["Value"][pilotReference]["Status"]
    return S_OK(statusOfPilot)
def __getJobPilotStatus(self, jobID):
    """Return the status of the pilot referenced by the given job.

    :param jobID: job ID
    :return: S_OK(pilot status), S_OK('NoPilot') when the job has no pilot reference
             or no pilot is found, S_ERROR when the pilot information cannot be
             obtained, or the failed result of the parameter query
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
        return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference')
    if not pilotReference:
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    pilotResult = PilotManagerClient().getPilotInfo(pilotReference)
    if pilotResult['OK']:
        return S_OK(pilotResult['Value'][pilotReference]['Status'])

    message = pilotResult['Message']
    # a missing pilot is recognised by the text of the error message
    if "No pilots found" in message:
        self.log.warn(message)
        return S_OK('NoPilot')

    detail = 'for job %d: ' % jobID
    detail += message
    self.log.error('Failed to get pilot information', detail)
    return S_ERROR('Failed to get the pilot status')
class TransformationCleaningAgent(AgentModule): """ .. class:: TransformationCleaningAgent :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance :param ~TransformationClient.TransformationClient transClient: TransformationClient instance :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance """ def __init__(self, *args, **kwargs): """c'tor""" AgentModule.__init__(self, *args, **kwargs) self.shifterProxy = None # # transformation client self.transClient = None # # wms client self.wmsClient = None # # request client self.reqClient = None # # file catalog client self.metadataClient = None # # transformations types self.transformationTypes = None # # directory locations self.directoryLocations = ["TransformationDB", "MetadataCatalog"] # # transformation metadata self.transfidmeta = "TransformationID" # # archive periof in days self.archiveAfter = 7 # # transformation log SEs self.logSE = "LogSE" # # enable/disable execution self.enableFlag = "True" self.dataProcTTypes = ["MCSimulation", "Merge"] self.dataManipTTypes = ["Replication", "Removal"] def initialize(self): """agent initialisation reading and setting config opts :param self: self reference """ # # shifter proxy # See cleanContent method: this proxy will be used ALSO when the file catalog used # is the DIRAC File Catalog (DFC). 
# This is possible because of unset of the "UseServerCertificate" option self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy) # # transformations types self.dataProcTTypes = Operations().getValue( "Transformations/DataProcessing", self.dataProcTTypes) self.dataManipTTypes = Operations().getValue( "Transformations/DataManipulation", self.dataManipTTypes) agentTSTypes = self.am_getOption("TransformationTypes", []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes) self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes)) # # directory locations self.directoryLocations = sorted( self.am_getOption("DirectoryLocations", self.directoryLocations)) self.log.info( "Will search for directories in the following locations: %s" % str(self.directoryLocations)) # # transformation metadata self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta) self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta) # # archive periof in days self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter) # days self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter) # # transformation log SEs self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE) self.log.info("Will remove logs found on storage element: %s" % self.logSE) # # transformation client self.transClient = TransformationClient() # # wms client self.wmsClient = WMSClient() # # request client self.reqClient = ReqClient() # # file catalog client self.metadataClient = FileCatalogClient() # # job monitoring client self.jobMonitoringClient = JobMonitoringClient() return S_OK() ############################################################################# def execute(self): """execution in one agent's cycle :param self: self reference """ self.enableFlag = 
self.am_getOption("EnableFlag", self.enableFlag) if self.enableFlag != "True": self.log.info( "TransformationCleaningAgent is disabled by configuration option EnableFlag" ) return S_OK("Disabled via CS flag") # Obtain the transformations in Cleaning status and remove any mention of the jobs/files res = self.transClient.getTransformations({ "Status": "Cleaning", "Type": self.transformationTypes }) if res["OK"]: for transDict in res["Value"]: if self.shifterProxy: self._executeClean(transDict) else: self.log.info( "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeClean)( transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"]) else: self.log.error("Failed to get transformations", res["Message"]) # Obtain the transformations in RemovingFiles status and removes the output files res = self.transClient.getTransformations({ "Status": "RemovingFiles", "Type": self.transformationTypes }) if res["OK"]: for transDict in res["Value"]: if self.shifterProxy: self._executeRemoval(transDict) else: self.log.info( "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeRemoval)( transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"]) else: self.log.error("Could not get the transformations", res["Message"]) # Obtain the transformations in Completed status and archive if inactive for X days olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter) res = self.transClient.getTransformations( { "Status": "Completed", "Type": self.transformationTypes }, older=olderThanTime, timeStamp="LastUpdate") if res["OK"]: for transDict in res["Value"]: if self.shifterProxy: self._executeArchive(transDict) else: self.log.info( "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) 
executeWithUserProxy(self._executeArchive)( transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"]) else: self.log.error("Could not get the transformations", res["Message"]) return S_OK() def finalize(self): """Only at finalization: will clean ancient transformations (remnants) 1) get the transformation IDs of jobs that are older than 1 year 2) find the status of those transformations. Those "Cleaned" and "Archived" will be cleaned and archived (again) Why doing this here? Basically, it's a race: 1) the production manager submits a transformation 2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation in their internal queue, so eventually during their (long-ish) cycle they'll work on it. 3) 1 minute after creating the transformation, the production manager cleans it (by hand, for whatever reason). So, the status is changed to "Cleaning" 4) the TransformationCleaningAgent cleans what has been created (maybe, nothing), then sets the transformation status to "Cleaned" or "Archived" 5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in, creating tasks and jobs for a production that's effectively cleaned (but these 2 agents don't know yet). Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent, but these 2 agents are already doing a lot of stuff, and are pretty heavy. So, we should just clean from time to time. What I added here is done only when the agent finalize, and it's quite light-ish operation anyway. 
""" res = self.jobMonitoringClient.getJobGroups( None, datetime.utcnow() - timedelta(days=365)) if not res["OK"]: self.log.error("Failed to get job groups", res["Message"]) return res transformationIDs = res["Value"] if transformationIDs: res = self.transClient.getTransformations( {"TransformationID": transformationIDs}) if not res["OK"]: self.log.error("Failed to get transformations", res["Message"]) return res transformations = res["Value"] toClean = [] toArchive = [] for transDict in transformations: if transDict["Status"] == "Cleaned": toClean.append(transDict) if transDict["Status"] == "Archived": toArchive.append(transDict) for transDict in toClean: if self.shifterProxy: self._executeClean(transDict) else: self.log.info( "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeClean)( transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"]) for transDict in toArchive: if self.shifterProxy: self._executeArchive(transDict) else: self.log.info( "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" % transDict) executeWithUserProxy(self._executeArchive)( transDict, proxyUserDN=transDict["AuthorDN"], proxyUserGroup=transDict["AuthorGroup"]) # Remove JobIDs that were unknown to the TransformationSystem jobGroupsToCheck = [ str(transDict["TransformationID"]).zfill(8) for transDict in toClean + toArchive ] res = self.jobMonitoringClient.getJobs( {"JobGroup": jobGroupsToCheck}) if not res["OK"]: return res jobIDsToRemove = [int(jobID) for jobID in res["Value"]] res = self.__removeWMSTasks(jobIDsToRemove) if not res["OK"]: return res return S_OK() def _executeClean(self, transDict): """Clean transformation.""" # if transformation is of type `Replication` or `Removal`, there is nothing to clean. 
# We just archive if transDict["Type"] in self.dataManipTTypes: res = self.archiveTransformation(transDict["TransformationID"]) if not res["OK"]: self.log.error( "Problems archiving transformation", "%s: %s" % (transDict["TransformationID"], res["Message"])) else: res = self.cleanTransformation(transDict["TransformationID"]) if not res["OK"]: self.log.error( "Problems cleaning transformation", "%s: %s" % (transDict["TransformationID"], res["Message"])) def _executeRemoval(self, transDict): """Remove files from given transformation.""" res = self.removeTransformationOutput(transDict["TransformationID"]) if not res["OK"]: self.log.error( "Problems removing transformation", "%s: %s" % (transDict["TransformationID"], res["Message"])) def _executeArchive(self, transDict): """Archive the given transformation.""" res = self.archiveTransformation(transDict["TransformationID"]) if not res["OK"]: self.log.error( "Problems archiving transformation", "%s: %s" % (transDict["TransformationID"], res["Message"])) return S_OK() ############################################################################# # # Get the transformation directories for checking # def getTransformationDirectories(self, transID): """get the directories for the supplied transformation from the transformation system. These directories are used by removeTransformationOutput and cleanTransformation for removing output. 
:param self: self reference :param int transID: transformation ID """ self.log.verbose( "Cleaning Transformation directories of transformation %d" % transID) directories = [] if "TransformationDB" in self.directoryLocations: res = self.transClient.getTransformationParameters( transID, ["OutputDirectories"]) if not res["OK"]: self.log.error("Failed to obtain transformation directories", res["Message"]) return res transDirectories = [] if res["Value"]: if not isinstance(res["Value"], list): try: transDirectories = ast.literal_eval(res["Value"]) except Exception: # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]' transDirectories.append(res["Value"]) else: transDirectories = res["Value"] directories = self._addDirs(transID, transDirectories, directories) if "MetadataCatalog" in self.directoryLocations: res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta: transID}) if not res["OK"]: self.log.error("Failed to obtain metadata catalog directories", res["Message"]) return res transDirectories = res["Value"] directories = self._addDirs(transID, transDirectories, directories) if not directories: self.log.info("No output directories found") directories = sorted(directories) return S_OK(directories) @classmethod def _addDirs(cls, transID, newDirs, existingDirs): """append unique :newDirs: list to :existingDirs: list :param self: self reference :param int transID: transformationID :param list newDirs: src list of paths :param list existingDirs: dest list of paths """ for folder in newDirs: transStr = str(transID).zfill(8) if re.search(transStr, str(folder)): if folder not in existingDirs: existingDirs.append(os.path.normpath(folder)) return existingDirs ############################################################################# # # These are the methods for performing the cleaning of catalogs and storage # def cleanContent(self, directory): """wipe out everything from catalog under folder :directory: :param self: self reference :params 
str directory: folder name """ self.log.verbose("Cleaning Catalog contents") res = self.__getCatalogDirectoryContents([directory]) if not res["OK"]: return res filesFound = res["Value"] if not filesFound: self.log.info( "No files are registered in the catalog directory %s" % directory) return S_OK() self.log.info( "Attempting to remove possible remnants from the catalog and storage", "(n=%d)" % len(filesFound)) # Executing with shifter proxy gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "false") res = DataManager().removeFile(filesFound, force=True) gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") if not res["OK"]: return res realFailure = False for lfn, reason in res["Value"]["Failed"].items(): if "File does not exist" in str(reason): self.log.warn("File %s not found in some catalog: " % (lfn)) else: self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason)) realFailure = True if realFailure: return S_ERROR("Failed to remove all files found in the catalog") return S_OK() def __getCatalogDirectoryContents(self, directories): """get catalog contents under paths :directories: :param self: self reference :param list directories: list of paths in catalog """ self.log.info("Obtaining the catalog contents for %d directories:" % len(directories)) for directory in directories: self.log.info(directory) activeDirs = directories allFiles = {} fc = FileCatalog() while activeDirs: currentDir = activeDirs[0] res = returnSingleResult(fc.listDirectory(currentDir)) activeDirs.remove(currentDir) if not res["OK"] and "Directory does not exist" in res[ "Message"]: # FIXME: DFC should return errno self.log.info("The supplied directory %s does not exist" % currentDir) elif not res["OK"]: if "No such file or directory" in res["Message"]: self.log.info("%s: %s" % (currentDir, res["Message"])) else: self.log.error( "Failed to get directory %s content" % currentDir, res["Message"]) else: dirContents = 
res["Value"] activeDirs.extend(dirContents["SubDirs"]) allFiles.update(dirContents["Files"]) self.log.info("", "Found %d files" % len(allFiles)) return S_OK(list(allFiles)) def cleanTransformationLogFiles(self, directory): """clean up transformation logs from directory :directory: :param self: self reference :param str directory: folder name """ self.log.verbose("Removing log files found in the directory", directory) res = returnSingleResult( StorageElement(self.logSE).removeDirectory(directory, recursive=True)) if not res["OK"]: if cmpError(res, errno.ENOENT): # No such file or directory self.log.warn("Transformation log directory does not exist", directory) return S_OK() self.log.error("Failed to remove log files", res["Message"]) return res self.log.info("Successfully removed transformation log directory") return S_OK() ############################################################################# # # These are the functional methods for archiving and cleaning transformations # def removeTransformationOutput(self, transID): """This just removes any mention of the output data from the catalog and storage""" self.log.info("Removing output data for transformation %s" % transID) res = self.getTransformationDirectories(transID) if not res["OK"]: self.log.error("Problem obtaining directories for transformation", "%s with result '%s'" % (transID, res)) return S_OK() directories = res["Value"] for directory in directories: if not re.search("/LOG/", directory): res = self.cleanContent(directory) if not res["OK"]: return res self.log.info("Removed %d directories from the catalog \ and its files from the storage for transformation %s" % (len(directories), transID)) # Clean ALL the possible remnants found in the metadata catalog res = self.cleanMetadataCatalogFiles(transID) if not res["OK"]: return res self.log.info("Successfully removed output of transformation", transID) # Change the status of the transformation to RemovedFiles res = 
self.transClient.setTransformationParameter( transID, "Status", "RemovedFiles") if not res["OK"]: self.log.error( "Failed to update status of transformation %s to RemovedFiles" % (transID), res["Message"]) return res self.log.info("Updated status of transformation %s to RemovedFiles" % (transID)) return S_OK() def archiveTransformation(self, transID): """This just removes job from the jobDB and the transformation DB :param self: self reference :param int transID: transformation ID """ self.log.info("Archiving transformation %s" % transID) # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res["OK"]: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res["OK"]: return res self.log.info("Successfully archived transformation %d" % transID) # Change the status of the transformation to archived res = self.transClient.setTransformationParameter( transID, "Status", "Archived") if not res["OK"]: self.log.error( "Failed to update status of transformation %s to Archived" % (transID), res["Message"]) return res self.log.info("Updated status of transformation %s to Archived" % (transID)) return S_OK() def cleanTransformation(self, transID): """This removes what was produced by the supplied transformation, leaving only some info and log in the transformation DB. 
""" self.log.info("Cleaning transformation", transID) res = self.getTransformationDirectories(transID) if not res["OK"]: self.log.error("Problem obtaining directories for transformation", "%s with result '%s'" % (transID, res["Message"])) return S_OK() directories = res["Value"] # Clean the jobs in the WMS and any failover requests found res = self.cleanTransformationTasks(transID) if not res["OK"]: return res # Clean the log files for the jobs for directory in directories: if re.search("/LOG/", directory): res = self.cleanTransformationLogFiles(directory) if not res["OK"]: return res res = self.cleanContent(directory) if not res["OK"]: return res # Clean ALL the possible remnants found res = self.cleanMetadataCatalogFiles(transID) if not res["OK"]: return res # Clean the transformation DB of the files and job information res = self.transClient.cleanTransformation(transID) if not res["OK"]: return res self.log.info("Successfully cleaned transformation", transID) res = self.transClient.setTransformationParameter( transID, "Status", "Cleaned") if not res["OK"]: self.log.error( "Failed to update status of transformation %s to Cleaned" % (transID), res["Message"]) return res self.log.info("Updated status of transformation", "%s to Cleaned" % (transID)) return S_OK() def cleanMetadataCatalogFiles(self, transID): """wipe out files from catalog""" res = self.metadataClient.findFilesByMetadata( {self.transfidmeta: transID}) if not res["OK"]: return res fileToRemove = res["Value"] if not fileToRemove: self.log.info("No files found for transID", transID) return S_OK() # Executing with shifter proxy gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "false") res = DataManager().removeFile(fileToRemove, force=True) gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") if not res["OK"]: return res for lfn, reason in res["Value"]["Failed"].items(): self.log.error("Failed to remove file found in metadata catalog", "%s %s" % 
(lfn, reason)) if res["Value"]["Failed"]: return S_ERROR( "Failed to remove all files found in the metadata catalog") self.log.info("Successfully removed all files found in the DFC") return S_OK() ############################################################################# # # These are the methods for removing the jobs from the WMS and transformation DB # def cleanTransformationTasks(self, transID): """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation""" self.log.verbose("Cleaning Transformation tasks of transformation", transID) res = self.__getTransformationExternalIDs(transID) if not res["OK"]: return res externalIDs = res["Value"] if externalIDs: res = self.transClient.getTransformationParameters( transID, ["Type"]) if not res["OK"]: self.log.error("Failed to determine transformation type") return res transType = res["Value"] if transType in self.dataProcTTypes: res = self.__removeWMSTasks(externalIDs) else: res = self.__removeRequests(externalIDs) if not res["OK"]: return res return S_OK() def __getTransformationExternalIDs(self, transID): """collect all ExternalIDs for transformation :transID: :param self: self reference :param int transID: transforamtion ID """ res = self.transClient.getTransformationTasks( condDict={"TransformationID": transID}) if not res["OK"]: self.log.error( "Failed to get externalIDs for transformation %d" % transID, res["Message"]) return res externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]] self.log.info("Found %d tasks for transformation" % len(externalIDs)) return S_OK(externalIDs) def __removeRequests(self, requestIDs): """This will remove requests from the RMS system -""" rIDs = [int(int(j)) for j in requestIDs if int(j)] for reqID in rIDs: self.reqClient.cancelRequest(reqID) return S_OK() def __removeWMSTasks(self, transJobIDs): """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system :param self: self reference :param list trasnJobIDs: job 
IDs """ # Prevent 0 job IDs jobIDs = [int(j) for j in transJobIDs if int(j)] allRemove = True for jobList in breakListIntoChunks(jobIDs, 500): res = self.wmsClient.killJob(jobList) if res["OK"]: self.log.info("Successfully killed %d jobs from WMS" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found jobs which did not exist in the WMS", "(n=%d)" % len(res["InvalidJobIDs"])) elif "NonauthorizedJobIDs" in res: self.log.error("Failed to kill jobs because not authorized", "(n=%d)" % len(res["NonauthorizedJobIDs"])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to kill jobs", "(n=%d)" % len(res["FailedJobIDs"])) allRemove = False res = self.wmsClient.deleteJob(jobList) if res["OK"]: self.log.info("Successfully deleted jobs from WMS", "(n=%d)" % len(jobList)) elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res): self.log.info("Found jobs which did not exist in the WMS", "(n=%d)" % len(res["InvalidJobIDs"])) elif "NonauthorizedJobIDs" in res: self.log.error("Failed to delete jobs because not authorized", "(n=%d)" % len(res["NonauthorizedJobIDs"])) allRemove = False elif "FailedJobIDs" in res: self.log.error("Failed to delete jobs", "(n=%d)" % len(res["FailedJobIDs"])) allRemove = False if not allRemove: return S_ERROR("Failed to delete all remnants from WMS") self.log.info("Successfully deleted all tasks from the WMS") if not jobIDs: self.log.info( "JobIDs not present, unable to delete associated requests.") return S_OK() failed = 0 failoverRequests = {} res = self.reqClient.getRequestIDsForJobs(jobIDs) if not res["OK"]: self.log.error("Failed to get requestID for jobs.", res["Message"]) return res failoverRequests.update(res["Value"]["Successful"]) if not failoverRequests: return S_OK() for jobID, requestID in res["Value"]["Successful"].items(): # Put this check just in case, tasks must have associated jobs if jobID 
== 0 or jobID == "0": continue res = self.reqClient.cancelRequest(requestID) if not res["OK"]: self.log.error("Failed to remove request from RequestDB", res["Message"]) failed += 1 else: self.log.verbose("Removed request %s associated to job %d." % (requestID, jobID)) if failed: self.log.info("Successfully removed requests", "(n=%d)" % (len(failoverRequests) - failed)) self.log.info("Failed to remove requests", "(n=%d)" % failed) return S_ERROR("Failed to remove all the request from RequestDB") self.log.info( "Successfully removed all the associated failover requests") return S_OK()
def __init__(self, *args, **kwargs):
    """Set up default options, service clients and the job-recovery rule table.

    ``self.todo`` maps the keys 'NoInputFiles' and 'InputFiles' to an ordered
    list of rules. Every rule is a dict with:

    * ``Message`` / ``ShortMessage``: descriptions of the rule
    * ``Counter``: initialised to 0
    * ``Check``: callable taking a job and returning a truth value
    * ``Actions``: callable taking (job, tInfo) that calls the job's
      correcting methods and returns their results as a list

    The order of the 'InputFiles' rules is significant: the first entry has
    to stay first and the 'Failed Hard' entry has to stay last.
    """
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'DataRecoveryAgent'
    self.enabled = False
    self.getJobInfoFromJDLOnly = False

    self.__getCSOptions()

    # This needs to be both otherwise we cannot account for all cases
    self.jobStatus = ['Failed', 'Done']

    self.jobMon = JobMonitoringClient()
    self.fcClient = FileCatalogClient()
    self.tClient = TransformationClient()
    self.reqClient = ReqClient()
    self.diracAPI = Dirac()
    self.inputFilesProcessed = set()

    def inputAlreadyProcessed(job):
        """True if all input files of the job are in self.inputFilesProcessed."""
        return set(job.inputFiles).issubset(self.inputFilesProcessed)

    noInputFilesRules = [
        {
            'Message': "NoInputFiles: OutputExists: Job 'Done'",
            'ShortMessage': "NoInputFiles: job 'Done' ",
            'Counter': 0,
            'Check': lambda job: job.allFilesExist() and job.status == 'Failed',
            'Actions': lambda job, tInfo: [job.setJobDone(tInfo)],
        },
        {
            'Message': "NoInputFiles: OutputMissing: Job 'Failed'",
            'ShortMessage': "NoInputFiles: job 'Failed' ",
            'Counter': 0,
            'Check': lambda job: job.allFilesMissing() and job.status == 'Done',
            'Actions': lambda job, tInfo: [job.setJobFailed(tInfo)],
        },
    ]

    inputFilesRules = [
        # must always be first!
        {
            'Message': "One of many Successful: clean others",
            'ShortMessage': "Other Tasks --> Keep",
            'Counter': 0,
            'Check': lambda job: (job.allFilesExist() and
                                  job.otherTasks and
                                  not inputAlreadyProcessed(job)),
            'Actions': lambda job, tInfo: [
                self.inputFilesProcessed.update(job.inputFiles),
                job.setJobDone(tInfo),
                job.setInputProcessed(tInfo),
            ],
        },
        {
            'Message': "Other Task processed Input, no Output: Fail",
            'ShortMessage': "Other Tasks --> Fail",
            'Counter': 0,
            'Check': lambda job: (inputAlreadyProcessed(job) and
                                  job.allFilesMissing() and
                                  job.status != 'Failed'),
            'Actions': lambda job, tInfo: [job.setJobFailed(tInfo)],
        },
        {
            'Message': "Other Task processed Input: Fail and clean",
            'ShortMessage': "Other Tasks --> Cleanup",
            'Counter': 0,
            'Check': lambda job: (inputAlreadyProcessed(job) and
                                  not job.allFilesMissing()),
            'Actions': lambda job, tInfo: [
                job.setJobFailed(tInfo),
                job.cleanOutputs(tInfo),
            ],
        },
        {
            'Message': "InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean",
            'ShortMessage': "Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
            'Counter': 0,
            'Check': lambda job: (job.inputFiles and
                                  job.allInputFilesMissing() and
                                  not job.allTransFilesDeleted()),
            'Actions': lambda job, tInfo: [
                job.cleanOutputs(tInfo),
                job.setJobFailed(tInfo),
                job.setInputDeleted(tInfo),
            ],
        },
        {
            'Message': "InputFile(s) Deleted, output Exists: mark job 'Failed', clean",
            'ShortMessage': "Input Deleted --> Job 'Failed, Cleanup",
            'Counter': 0,
            'Check': lambda job: (job.inputFiles and
                                  job.allInputFilesMissing() and
                                  job.allTransFilesDeleted() and
                                  not job.allFilesMissing()),
            'Actions': lambda job, tInfo: [
                job.cleanOutputs(tInfo),
                job.setJobFailed(tInfo),
            ],
        },
        # All Output Exists
        {
            'Message': "Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
            'ShortMessage': "Output Exists --> Job Done, Input Processed",
            'Counter': 0,
            'Check': lambda job: (job.allFilesExist() and
                                  not job.otherTasks and
                                  job.status == 'Failed' and
                                  not job.allFilesProcessed() and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [
                job.setJobDone(tInfo),
                job.setInputProcessed(tInfo),
            ],
        },
        {
            'Message': "Output Exists, job Failed, input Processed --> Job Done",
            'ShortMessage': "Output Exists --> Job Done",
            'Counter': 0,
            'Check': lambda job: (job.allFilesExist() and
                                  not job.otherTasks and
                                  job.status == 'Failed' and
                                  job.allFilesProcessed() and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [job.setJobDone(tInfo)],
        },
        {
            'Message': "Output Exists, job Done, input not Processed --> Input Processed",
            'ShortMessage': "Output Exists --> Input Processed",
            'Counter': 0,
            'Check': lambda job: (job.allFilesExist() and
                                  not job.otherTasks and
                                  job.status == 'Done' and
                                  not job.allFilesProcessed() and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [job.setInputProcessed(tInfo)],
        },
        # outputmissing
        {
            'Message': "Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
            'ShortMessage': "Max ErrorCount --> Input MaxReset",
            'Counter': 0,
            'Check': lambda job: (job.allFilesMissing() and
                                  not job.otherTasks and
                                  job.status == 'Failed' and
                                  job.allFilesAssigned() and
                                  not inputAlreadyProcessed(job) and
                                  job.allInputFilesExist() and
                                  job.checkErrorCount()),
            'Actions': lambda job, tInfo: [job.setInputMaxReset(tInfo)],
        },
        {
            'Message': "Output Missing, job Failed, input Assigned --> Input Unused",
            'ShortMessage': "Output Missing --> Input Unused",
            'Counter': 0,
            'Check': lambda job: (job.allFilesMissing() and
                                  not job.otherTasks and
                                  job.status == 'Failed' and
                                  job.allFilesAssigned() and
                                  not inputAlreadyProcessed(job) and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [job.setInputUnused(tInfo)],
        },
        {
            'Message': "Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
            'ShortMessage': "Output Missing --> Job Failed, Input Unused",
            'Counter': 0,
            'Check': lambda job: (job.allFilesMissing() and
                                  not job.otherTasks and
                                  job.status == 'Done' and
                                  job.allFilesAssigned() and
                                  not inputAlreadyProcessed(job) and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [
                job.setInputUnused(tInfo),
                job.setJobFailed(tInfo),
            ],
        },
        # some files missing, needing cleanup. Only checking for
        # assigned, because processed could mean an earlier job was
        # successful and this one is just the duplicate that needed
        # to be removed! But we check for other tasks earlier, so
        # this should not happen
        {
            'Message': "Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
            'ShortMessage': "Output Missing --> Cleanup, Input Unused",
            'Counter': 0,
            'Check': lambda job: (job.someFilesMissing() and
                                  not job.otherTasks and
                                  job.status == 'Failed' and
                                  job.allFilesAssigned() and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [
                job.cleanOutputs(tInfo),
                job.setInputUnused(tInfo),
            ],
        },
        {
            'Message': "Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
            'ShortMessage': "Output Missing --> Cleanup, Job Failed, Input Unused",
            'Counter': 0,
            'Check': lambda job: (job.someFilesMissing() and
                                  not job.otherTasks and
                                  job.status == 'Done' and
                                  job.allFilesAssigned() and
                                  job.allInputFilesExist()),
            'Actions': lambda job, tInfo: [
                job.cleanOutputs(tInfo),
                job.setInputUnused(tInfo),
                job.setJobFailed(tInfo),
            ],
        },
        {
            'Message': "Some missing, job Done --> job Failed",
            'ShortMessage': "Output Missing, Done --> Job Failed",
            'Counter': 0,
            'Check': lambda job: not job.allFilesExist() and job.status == 'Done',
            'Actions': lambda job, tInfo: [job.setJobFailed(tInfo)],
        },
        {
            'Message': "Something Strange",
            'ShortMessage': "Strange",
            'Counter': 0,
            'Check': lambda job: job.status not in ("Failed", "Done"),
            'Actions': lambda job, tInfo: [],
        },
        # should always be the last one!
        {
            'Message': "Failed Hard",
            'ShortMessage': "Failed Hard",
            'Counter': 0,
            'Check': lambda job: False,  # never
            'Actions': lambda job, tInfo: [],
        },
    ]

    self.todo = {
        'NoInputFiles': noInputFilesRules,
        'InputFiles': inputFilesRules,
    }

    self.jobCache = defaultdict(lambda: (0, 0))
    # Notification options
    self.notesToSend = ""
    self.subject = "DataRecoveryAgent"
    self.startTime = time.time()
def getProductionApplicationSummary(self,
                                    productionID,
                                    status=None,
                                    minorStatus=None,
                                    printOutput=False):
    """Return an application status summary for the given production.

    This queries the WMS for the given productionID and provides an
    up-to-date snapshot of the application status combinations and
    associated WMS JobIDs. If printOutput is specified, the result is also
    printed to the screen.

    The summary is a nested dictionary::

        {Status: {MinorStatus: {ApplicationStatus: {'Total': n,
                                                    'JobList': [jobIDs]}}}}

    :param productionID: production ID (string, long or int)
    :param status: optional status, passed to getProdJobMetadata
    :param minorStatus: optional minor status, passed to getProdJobMetadata
    :param bool printOutput: print a formatted table and the progress
    :return: S_OK(summary); a 'Totals' entry is added to the result when
        neither status nor minorStatus is given. S_ERROR (or the failing
        result) otherwise.
    """
    if not isinstance(productionID, (int, long, str)):
        return self._errorReport(
            'Expected string, long or int for production ID')

    statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
    if not statusDict['OK']:
        self.log.warn('Could not get production metadata information')
        return statusDict

    jobIDs = list(statusDict['Value'])
    if not jobIDs:
        return S_ERROR('No JobIDs with matching conditions found')

    self.log.verbose('Considering %s jobs with selected conditions' %
                     (len(jobIDs)))
    # now need to get the application status information
    result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
    if not result['OK']:
        self.log.warn('Could not get application status for jobs list')
        return result

    appStatus = result['Value']

    # Now format the result.
    summary = {}
    submittedJobs = 0
    doneJobs = 0
    for job, atts in statusDict['Value'].items():
        # Only jobs that carry a 'Status' attribute are counted
        if 'Status' not in atts:
            continue
        uniqueStatus = atts['Status'].capitalize()
        minor = atts['MinorStatus']
        application = appStatus[job]['ApplicationStatus']
        # setdefault creates each missing level without touching siblings;
        # re-creating the MinorStatus level for a new ApplicationStatus
        # would drop the application statuses already counted under it.
        entry = summary.setdefault(uniqueStatus, {}) \
                       .setdefault(minor, {}) \
                       .setdefault(application, {'Total': 0, 'JobList': []})
        entry['Total'] += 1
        entry['JobList'].append(job)
        submittedJobs += 1
        if uniqueStatus == 'Done':
            doneJobs += 1

    if not printOutput:
        result = S_OK()
        if not status and not minorStatus:
            result['Totals'] = {
                'Submitted': int(submittedJobs),
                'Done': int(doneJobs)
            }
        result['Value'] = summary
        return result

    # If a printed summary is requested
    statAdj = int(0.5 * self.prodAdj)
    mStatAdj = int(2.0 * self.prodAdj)
    totalAdj = int(0.5 * self.prodAdj)
    exAdj = int(0.5 * self.prodAdj)
    message = '\nJob Summary for ProductionID %s considering status %s' % (
        productionID, status)
    if minorStatus:
        message += ' and MinorStatus = %s' % (minorStatus)

    message += ':\n\n'
    message += ('Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) +
                'ApplicationStatus'.ljust(mStatAdj) +
                'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n')
    for stat, metadata in summary.items():
        message += '\n'
        for minor, appInfo in metadata.items():
            message += '\n'
            for appStat, jobInfo in appInfo.items():
                message += (stat.ljust(statAdj) + minor.ljust(mStatAdj) +
                            appStat.ljust(mStatAdj) +
                            str(jobInfo['Total']).ljust(totalAdj) +
                            str(jobInfo['JobList'][0]).ljust(exAdj) + '\n')

    # The table was built but never shown: print it, as printOutput promises
    print(message)

    if status or minorStatus:
        return S_OK(summary)

    result = self.getProductionProgress(productionID)
    if not result['OK']:
        self.log.warn('Could not get production progress information')
        return result

    if 'Created' in result['Value']:
        createdJobs = int(result['Value']['Created']) + submittedJobs
    else:
        createdJobs = submittedJobs

    # Guard against a division by zero when no job was counted at all
    percSub = int(100 * submittedJobs / createdJobs) if createdJobs else 0
    percDone = int(100 * doneJobs / createdJobs) if createdJobs else 0
    print('\nCurrent status of production %s:\n' % productionID)
    print('Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' +
          str(submittedJobs).ljust(7) + 'Submitted / '.ljust(15) +
          str(createdJobs).ljust(7) + ' Created jobs )')
    print('Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' +
          str(doneJobs).ljust(7) + 'Done / '.ljust(15) +
          str(createdJobs).ljust(7) + ' Created jobs )')
    result = S_OK()
    result['Totals'] = {
        'Submitted': int(submittedJobs),
        'Created': int(createdJobs),
        'Done': int(doneJobs)
    }
    result['Value'] = summary
    return result
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit several jobs (different sites, types, inputs) and query the monitoring.

    One job is submitted per (destination, input data, job type)
    combination; the JobMonitoring selectors, counters and summaries are
    then checked, the status of the last submitted job is updated through
    JobStateUpdate, and finally all the jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'],
             ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(
                    job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(
        set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']),
        msg="Got %s" % str(res['Value']))
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']),
                  [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']),
                  [['Job accepted'], sorted(['Job accepted', 'matching'])])
    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    # A state without jobs may be absent from the counters: count it as 0
    # rather than letting a TypeError silently skip the check.
    counters = res['Value']
    self.assertTrue(
        counters.get('Received', 0) + counters.get('Waiting', 0) >=
        len(dests) * len(lfnss) * len(types),
        msg="Got %s" % str(counters))
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # jobID is the last job submitted in the loop above
    res = jobStateUpdate.setJobStatusBulk(
        jobID, {
            str(datetime.datetime.utcnow()): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def test_FullChain(self):
    """Drive one job through submit, status updates, reset, reschedule, kill and delete.

    This test will

    - call all the WMSClient methods that will end up calling all the
      JobManager service methods
    - use the JobMonitoring to verify few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs
    """

    def checkOK(result):
        """Assert that a DIRAC result structure reports success."""
        self.assertTrue(result['OK'], result.get('Message'))

    def checkStatus(expected):
        """Assert that the monitoring reports the expected job status."""
        statusRes = monitoring.getJobStatus(jobID)
        checkOK(statusRes)
        self.assertEqual(statusRes['Value'],
                         expected,
                         msg="Got %s" % str(statusRes['Value']))

    wms = WMSClient()
    monitoring = JobMonitoringClient()
    stateUpdate = JobStateUpdateClient()

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wms.submitJob(job._toJDL(xmlFile=jobDescription))
    checkOK(res)
    self.assertTrue(isinstance(res['Value'], int),
                    msg="Got %s" % type(res['Value']))
    self.assertEqual(res['Value'],
                     res['JobID'],
                     msg="Got %s, expected %s" %
                     (str(res['Value']), res['JobID']))
    jobID = res['Value']

    # updating the status
    checkOK(stateUpdate.setJobStatus(jobID, 'Running',
                                     'Executing Minchiapp', 'source'))

    # reset the job
    checkOK(wms.resetJob(jobID))

    # reschedule the job
    checkOK(wms.rescheduleJob(jobID))
    checkStatus('Received')

    res = monitoring.getJobsMinorStatus([jobID])
    checkOK(res)
    expectedMinor = {jobID: {'MinorStatus': 'Job Rescheduled',
                             'JobID': jobID}}
    self.assertEqual(res['Value'],
                     expectedMinor,
                     msg="Got %s" % str(res['Value']))

    res = monitoring.getJobsApplicationStatus([jobID])
    checkOK(res)
    expectedApp = {jobID: {'ApplicationStatus': 'Unknown',
                           'JobID': jobID}}
    self.assertEqual(res['Value'],
                     expectedApp,
                     msg="Got %s" % str(res['Value']))

    # updating the status again
    checkOK(stateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                     'source'))

    # kill the job
    checkOK(wms.killJob(jobID))
    checkStatus('Killed')

    # updating the status aaaagain
    checkOK(stateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source'))

    # kill the job
    checkOK(wms.killJob(jobID))
    # this time it won't kill... it's done!
    checkStatus('Done')

    # delete the job - this will just set its status to "deleted"
    checkOK(wms.deleteJob(jobID))
    checkStatus('Deleted')
class WorkflowTasks(TaskBase):
    """ Handles jobs: builds job objects out of transformation tasks, submits them through the
        submission client and translates job statuses back into task and file statuses.
    """

    def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
                 outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
                 ownerDN=None, ownerGroup=None):
        """ Generates some default objects.
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
            VOs can pass in their job class extension, if present

            :param transClient: forwarded to the TaskBase constructor
            :param logger: logger to use; a 'WorkflowTasks' sub-logger of gLogger when not given
            :param submissionClient: client used to submit jobs; a WMSClient when not given
            :param jobMonitoringClient: client used to query jobs; a JobMonitoringClient when not given
            :param outputDataModule: output data module name; read from the
                                     "Transformations/OutputDataModule" option when not given
            :param jobClass: class used to build job objects; Job when not given
            :param opsH: operations helper; a new Operations() when not given
            :param destinationPlugin: destination plugin name; read from the
                                      "Transformations/DestinationPlugin" option ('BySE' default)
                                      when not given
            :param ownerDN: DN passed as delegatedDN to the default WMSClient
            :param ownerGroup: group passed as delegatedGroup to the default WMSClient
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are requested only when both the DN and the group are provided
        useCertificates = bool(ownerDN and ownerGroup)
        if not submissionClient:
            submissionClient = WMSClient(useCertificates=useCertificates,
                                         delegatedDN=ownerDN,
                                         delegatedGroup=ownerGroup)
        self.submissionClient = submissionClient

        self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
        self.jobClass = jobClass or Job
        self.opsH = opsH or Operations()
        self.outputDataModule = (outputDataModule or
                                 self.opsH.getValue("Transformations/OutputDataModule", ""))
        self.destinationPlugin = (destinationPlugin or
                                  self.opsH.getValue('Transformations/DestinationPlugin', 'BySE'))

        # Created on first use by _handleDestination / getOutputData, then reused
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

            Missing owner/ownerGroup are taken from the current proxy info, a missing ownerDN is
            looked up from the owner name.

            :param transBody: transformation job template
            :param taskDict: dictionary of per task parameters
            :param owner: owner of the transformation
            :param ownerGroup: group of the owner of the transformation
            :param ownerDN: DN of the owner of the transformation
            :param bulkSubmissionFlag: if True prepare one job object for all the tasks,
                                       otherwise one job object per task
            :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

            The job object is stored in taskDict under the 'BulkJobObject' key; the per-task
            values are attached to it as parameter sequences, ordered by sorted task ID.

            :return: S_OK(taskDict), S_OK({}) for an empty taskDict, or S_ERROR when a destination
                     cannot be evaluated or tasks with and without input data are mixed
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        method = 'prepareTransformationTasksBulk'
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'PRODUCTION_ID',
                               'string',
                               str(transID).zfill(8),
                               "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            self._logVerbose('Setting Site: ', str(sites), transID=transID)
            seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                # A previous task had input data while this one has none
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                         transID=transID, method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    # This task contributes nothing to the parameter sequences
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                                           name,
                                           'JDL',
                                           "%%(%s)s" % name,
                                           name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        # NOTE(review): outputParameterList holds only the names produced for the last task
        # processed by the loop above - confirm this is the intended set of workflow parameters
        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList:
                oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                oJob.setParameterSequence(paramName, paramSeq)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

            Each task dictionary gets a 'TaskObject' entry: the prepared job object, or an
            empty string when the job could not be prepared.

            :return: S_OK(taskDict)
        """
        # Nothing to prepare: return now, transID (taken from the first task) would be unset
        if not taskDict:
            return S_OK(taskDict)

        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)
        site = oJobTemplate.workflow.findParameter('Site').getValue()
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateOK = False
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                transID = paramsDict['TransformationID']
                self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                                 transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' % (transGroup),
                                 transID=transID, method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID',
                                               'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'JOB_ID',
                                               'string',
                                               '00000000',
                                               "Initial JOB_ID")

            paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            # NOTE(review): the output data module always receives InputData=None here, while the
            # bulk preparation passes the task input data - confirm which one is intended
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue

            self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
            res = oJob.setDestination(sites)
            if not res['OK']:
                self._logError('Could not set the site: %s' % res['Message'],
                               transID=transID, method=method)
                # Mark the task as not prepared: the submission step reads 'TaskObject'
                paramsDict['TaskObject'] = ''
                continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                          'TaskID': taskID, 'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob

        self._logVerbose('Average getOutputData time: %.1f per task' %
                         (getOutputDataTiming / len(taskDict)),
                         transID=transID, method=method)
        self._logInfo('Prepared %d tasks' % len(taskDict),
                      transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

            :param paramsDict: task parameters, 'Site' (';' separated string) is used if present
            :return: list of destination sites; empty list when the destination plugin
                     object cannot be created, so that callers testing "if not sites" see
                     the failure
        """
        sites = ['ANY']
        if paramsDict.get('Site'):
            # 'Site' comes from the XML and therefore is ; separated
            sites = fromChar(paramsDict['Site'], sepChar=';')

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                # Callers expect a list of sites, not an S_ERROR structure
                return []
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)

            :param oJob: job object on which setInputData is called
            :param paramsDict: task parameters; 'InputData' is used when present and not empty
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID, method='handleInputs')
            oJob.setInputData(inputData)

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination

            Parameters with an empty value are skipped.
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' % (paramName, paramValue),
                                   transID=transID, method='handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs

            Sets the job type to 'Hospital', the input data policy to 'download' and the
            destination to the configured hospital site (and CEs, if any are configured).
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

            :param plugin: plugin name given to the TaskManagerPlugin constructor
            :return: S_OK(plugin object) or S_ERROR with the failure reason
        """
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            errMsg = "Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)
            self._logException(errMsg)
            return S_ERROR(errMsg)
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            errMsg = "Failed to create %s(): %s." % (plugin, e)
            self._logException(errMsg)
            return S_ERROR(errMsg)

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

            The module object is created on the first call and reused afterwards.

            :param paramDict: "argument" set on the module before it is executed
            :return: the result of the module execute() call, or the failed getModule result
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks

            Bulk submission is used when taskDict carries a 'BulkJobObject' entry.
        """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

            On success every task gets its 'ExternalID' and 'Success' set; job IDs are matched
            to the tasks sorted by ID.

            :return: S_OK(taskDict) or S_ERROR
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID,
                           method='submitTransformationTasksBulk')
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method='submitTransformationTasksBulk')
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

            Tasks without a 'TaskObject' are marked as failed without being submitted.

            :return: S_OK(taskDict), each task having its 'Success' flag set
        """
        method = 'submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job to the WMS.

            :param job: either a string, from which a job object is built, or an instance
                        of the configured job class
            :return: the result of the submission client submitJob call, or S_ERROR when no
                     valid job object is available
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Look up in the WMS the jobs matching the names of the given tasks

            :param taskDicts: list of task dictionaries with 'TransformationID' and 'TaskID'
            :return: S_OK with {'NoTasks': job names with no job found,
                                'TaskNameIDs': {job name: WMS job ID}}, or S_ERROR
        """
        transID = None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                    for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            resSummary = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not resSummary['OK']:
                # Best effort: a job whose summary cannot be read is reported as having no task
                self._logWarn("Failed to get task summary from WMS", resSummary['Message'],
                              transID=transID, method='updateTransformationReservedTasks')
            else:
                jobNameIDs[resSummary['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status

        Tasks without an external ID are ignored. A job not known to the WMS any more is
        considered 'Removed' and its task is reported as 'Failed'.

        :return: S_OK({new status: [task IDs]}) or S_ERROR
        """
        if not taskDicts:
            return S_OK({})
        wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
        transID = taskDicts[0]['TransformationID']

        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system", transID=transID)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                     (transID, taskID, oldStatus), transID=transID)
                    newStatus = "Failed"
                self._logVerbose('Setting job status for Production/Job %d/%d to %s' %
                                 (transID, taskID, newStatus), transID=transID)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN

        Files of tasks with no job in the WMS go back to 'Unused'; files of jobs that are
        Done/Completed become 'Processed', files of Failed jobs become 'Unused'.

        :return: S_OK({LFN: new status}) containing only the files whose status changes,
                 or S_ERROR
        """
        if not fileDicts:
            return S_OK({})
        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files", transID=transID)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system", transID=transID)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = {'Done': 'Processed',
                             'Completed': 'Processed',
                             'Failed': 'Unused'}.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
# Collect the job IDs given as positional arguments; each argument may hold several
# comma-separated IDs
jobs = []
for arg in Script.getPositionalArgs():
    try:
        jobs += [int(job) for job in arg.split(',')]
    except ValueError:
        # Any non-integer job ID is fatal
        gLogger.fatal("Invalid list of jobIDs")
        DIRAC.exit(2)

from DIRAC.DataManagementSystem.Client.DataManager import DataManager
from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient
from DIRAC.Core.Utilities.SiteSEMapping import getSEsForSite

dm = DataManager()
bk = BookkeepingClient()

monitoring = JobMonitoringClient()

if not jobs:
    # No explicit job ID: build the selection conditions instead
    conditions = {
        'Status': 'Failed',
        'MinorStatus': 'Maximum of reschedulings reached',
        'ApplicationStatus': 'Failed Input Data Resolution '
    }
    # prStr describes which jobs the conditions select
    prStr = 'all jobs'
    if production:
        prStr = 'production %s' % ' '.join(production)
        # A single production is used as a plain value rather than a one-element list
        if len(production) == 1:
            production = production[0]
        # NOTE(review): nesting reconstructed from flattened source - assumes 'JobGroup' is set
        # for any non-empty production, not only for a single one; confirm against the original
        conditions['JobGroup'] = production
    if userName:
        prStr = 'user %s' % userName
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """ Submit several jobs (different types and inputs) and exercise the JobMonitoring and
        JobStateUpdate clients on them.

        One job is submitted for each combination of input LFN list and job type; the
        monitoring calls are then checked against what was submitted, and the jobs are
        deleted at the end.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'}, msg="Got %s" % res['Value'])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value'])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    # Report the sorted list of states, not the sorted characters of its string form
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % str(sorted(res['Value'])))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))

    # Job groups: no date filter and "now" must agree; a one year old cut-off gives a subset
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']), [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']), [['Job accepted'], sorted(['Job accepted', 'Job Rescheduled'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue({str(x) for x in jobIDs} <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting')
                        >= len(lfnss) * len(types))
    except TypeError:
        # One of the two counters is missing (None): the sum cannot be checked
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # The state updates below act on the last submitted job only
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                           'MinorStatus': 'MinorStatus',
                                           'ApplicationStatus': 'ApplicationStatus',
                                           'Source': 'Unknown'}})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
class MonitorAgents(AgentModule): """MonitorAgents class.""" def __init__(self, *args, **kwargs): """Initialize the agent, clients, default values.""" AgentModule.__init__(self, *args, **kwargs) self.name = 'MonitorAgents' self.setup = "Production" self.enabled = False self.restartAgents = False self.restartExecutors = False self.restartServices = False self.controlComponents = False self.commitURLs = False self.diracLocation = "/opt/dirac/pro" self.sysAdminClient = SystemAdministratorClient(socket.gethostname()) self.jobMonClient = JobMonitoringClient() self.nClient = NotificationClient() self.csAPI = None self.agents = dict() self.executors = dict() self.services = dict() self.errors = list() self.accounting = defaultdict(dict) self.addressTo = ["*****@*****.**"] self.addressFrom = "*****@*****.**" self.emailSubject = "MonitorAgents on %s" % socket.gethostname() def logError(self, errStr, varMsg=''): """Append errors to a list, which is sent in email notification.""" self.log.error(errStr, varMsg) self.errors.append(errStr + " " + varMsg) def beginExecution(self): """Reload the configurations before every cycle.""" self.setup = self.am_getOption("Setup", self.setup) self.enabled = self.am_getOption("EnableFlag", self.enabled) self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents) self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors) self.restartServices = self.am_getOption("RestartServices", self.restartServices) self.diracLocation = os.environ.get("DIRAC", self.diracLocation) self.addressTo = self.am_getOption('MailTo', self.addressTo) self.addressFrom = self.am_getOption('MailFrom', self.addressFrom) self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents) self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs) self.csAPI = CSAPI() res = self.getRunningInstances(instanceType='Agents') if not res["OK"]: return S_ERROR("Failure to get running agents") self.agents = 
res["Value"] res = self.getRunningInstances(instanceType='Executors') if not res["OK"]: return S_ERROR("Failure to get running executors") self.executors = res["Value"] res = self.getRunningInstances(instanceType='Services') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] self.accounting.clear() return S_OK() def sendNotification(self): """Send email notification about changes done in the last cycle.""" if not(self.errors or self.accounting): return S_OK() emailBody = "" rows = [] for instanceName, val in self.accounting.iteritems(): rows.append([[instanceName], [val.get('Treatment', 'No Treatment')], [str(val.get('LogAge', 'Not Relevant'))]]) if rows: columns = ["Instance", "Treatment", "Log File Age (Minutes)"] emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ') if self.errors: emailBody += "\n\nErrors:" emailBody += "\n".join(self.errors) self.log.notice("Sending Email:\n" + emailBody) for address in self.addressTo: res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False) if not res['OK']: self.log.error("Failure to send Email notification to ", address) continue self.errors = [] self.accounting.clear() return S_OK() def getRunningInstances(self, instanceType='Agents', runitStatus='Run'): """Return a dict of running agents, executors or services. 
Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation :param str instanceType: 'Agents', 'Executors', 'Services' :param str runitStatus: Return only those instances with given RunitStatus or 'All' :returns: Dictionary of running instances """ res = self.sysAdminClient.getOverallStatus() if not res["OK"]: self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"]) return res val = res['Value'][instanceType] runningAgents = defaultdict(dict) for system, agents in val.iteritems(): for agentName, agentInfo in agents.iteritems(): if agentInfo['Setup'] and agentInfo['Installed']: if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus: continue confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName) for option, default in (('PollingTime', HOUR), ('Port', None)): optPath = os.path.join(confPath, option) runningAgents[agentName][option] = gConfig.getValue(optPath, default) runningAgents[agentName]["LogFileLocation"] = \ os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current') runningAgents[agentName]["PID"] = agentInfo["PID"] runningAgents[agentName]['Module'] = agentInfo['Module'] runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus'] runningAgents[agentName]['System'] = system return S_OK(runningAgents) def on_terminate(self, agentName, process): """Execute callback when a process terminates gracefully.""" self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid)) def execute(self): """Execute checks for agents, executors, services.""" for instanceType in ('executor', 'agent', 'service'): for name, options in getattr(self, instanceType + 's').iteritems(): # call checkAgent, checkExecutor, checkService res = getattr(self, 'check' + instanceType.capitalize())(name, options) if not res['OK']: self.logError("Failure when checking %s" % instanceType, "%s, %s" 
% (name, res['Message'])) res = self.componentControl() if not res['OK']: if "Stopped does not exist" not in res['Message'] and \ "Running does not exist" not in res['Message']: self.logError("Failure to control components", res['Message']) if not self.errors: res = self.checkURLs() if not res['OK']: self.logError("Failure to check URLs", res['Message']) else: self.logError('Something was wrong before, not checking URLs this time') self.sendNotification() if self.errors: return S_ERROR("Error during this cycle, check log") return S_OK() @staticmethod def getLastAccessTime(logFileLocation): """Return the age of log file.""" lastAccessTime = 0 try: lastAccessTime = os.path.getmtime(logFileLocation) lastAccessTime = datetime.fromtimestamp(lastAccessTime) except OSError as e: return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e)) now = datetime.now() age = now - lastAccessTime return S_OK(age) def restartInstance(self, pid, instanceName, enabled): """Kill a process which is then restarted automatically.""" if not (self.enabled and enabled): self.log.info("Restarting is disabled, please restart %s manually" % instanceName) self.accounting[instanceName]["Treatment"] = "Please restart it manually" return S_OK(NO_RESTART) try: agentProc = psutil.Process(int(pid)) processesToTerminate = agentProc.children(recursive=True) processesToTerminate.append(agentProc) for proc in processesToTerminate: proc.terminate() _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)) for proc in alive: self.log.info("Forcefully killing process %s" % proc.pid) proc.kill() return S_OK() except psutil.Error as err: self.logError("Exception occurred in terminating processes", "%s" % err) return S_ERROR() def checkService(self, serviceName, options): """Ping the service, restart if the ping does not respond.""" url = self._getURL(serviceName, options) self.log.info("Pinging service", url) pingRes = Client().ping(url=url) 
if not pingRes['OK']: self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message'])) res = self.restartInstance(int(options['PID']), serviceName, self.restartServices) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[serviceName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % serviceName) self.log.info("Service responded OK") return S_OK() def checkAgent(self, agentName, options): """Check the age of agent's log file, if it is too old then restart the agent.""" pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID'] self.log.info("Checking Agent: %s" % agentName) self.log.info("Polling Time: %s" % pollingTime) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES))) maxLogAge = max(pollingTime + HOUR, 2 * HOUR) if age.seconds < maxLogAge: return S_OK() self.log.info("Current log file is too old for Agent %s" % agentName) self.accounting[agentName]["LogAge"] = age.seconds / MINUTES res = self.restartInstance(int(pid), agentName, self.restartAgents) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[agentName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % agentName) return S_OK() def checkExecutor(self, executor, options): """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.""" currentLogLocation = options['LogFileLocation'] pid = options['PID'] self.log.info("Checking executor: %s" % executor) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] 
self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES))) if age.seconds < 2 * HOUR: return S_OK() self.log.info("Current log file is too old for Executor %s" % executor) self.accounting[executor]["LogAge"] = age.seconds / MINUTES res = self.checkForCheckingJobs(executor) if not res['OK']: return res if res['OK'] and res['Value'] == NO_CHECKING_JOBS: self.accounting.pop(executor, None) return S_OK(NO_RESTART) res = self.restartInstance(int(pid), executor, self.restartExecutors) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[executor]["Treatment"] = "Successfully Restarted" self.log.info("Executor %s has been successfully restarted" % executor) return S_OK() def checkForCheckingJobs(self, executorName): """Check if there are checking jobs with the **executorName** as current MinorStatus.""" attrDict = {'Status': 'Checking', 'MinorStatus': executorName} # returns list of jobs IDs resJobs = self.jobMonClient.getJobs(attrDict) if not resJobs['OK']: self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message'])) return resJobs if resJobs['Value']: self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName)) return S_OK(CHECKING_JOBS) self.log.info("Found no jobs in 'Checking' status for %s" % executorName) return S_OK(NO_CHECKING_JOBS) def componentControl(self): """Monitor and control component status as defined in the CS. 
Check for running and stopped components and ensure they have the proper status as defined in the CS Registry/Hosts/_HOST_/[Running|Stopped] sections :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`, :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR` """ # get the current status of the components resCurrent = self._getCurrentComponentStatus() if not resCurrent['OK']: return resCurrent currentStatus = resCurrent['Value'] resDefault = self._getDefaultComponentStatus() if not resDefault['OK']: return resDefault defaultStatus = resDefault['Value'] # ensure instances are in the right state shouldBe = {} shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down']) shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run']) shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All']) self._ensureComponentRunning(shouldBe['Run']) self._ensureComponentDown(shouldBe['Down']) for instance in shouldBe['Unknown']: self.logError("Unknown instance", "%r, either uninstall or add to config" % instance) return S_OK() def _getCurrentComponentStatus(self): """Get current status for components.""" resOverall = self.sysAdminClient.getOverallStatus() if not resOverall['OK']: return resOverall currentStatus = {'Down': set(), 'Run': set(), 'All': set()} informationDict = resOverall['Value'] for systemsDict in informationDict.values(): for system, instancesDict in systemsDict.items(): for instanceName, instanceInfoDict in instancesDict.items(): identifier = '%s__%s' % (system, instanceName) runitStatus = instanceInfoDict.get('RunitStatus') if runitStatus in ('Run', 'Down'): currentStatus[runitStatus].add(identifier) currentStatus['All'] = currentStatus['Run'] | currentStatus['Down'] return S_OK(currentStatus) def _getDefaultComponentStatus(self): """Get the configured status of the components.""" host = socket.gethostname() defaultStatus = {'Down': set(), 'Run': set(), 'All': set()} resRunning = 
gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running')) resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped')) if not resRunning['OK']: return resRunning if not resStopped['OK']: return resStopped defaultStatus['Run'] = set(resRunning['Value'].keys()) defaultStatus['Down'] = set(resStopped['Value'].keys()) defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down'] if defaultStatus['Run'].intersection(defaultStatus['Down']): self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down']))) return S_ERROR("Bad host configuration") return S_OK(defaultStatus) def _ensureComponentRunning(self, shouldBeRunning): """Ensure the correct components are running.""" for instance in shouldBeRunning: self.log.info("Starting instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.startComponent(system, name) if not res['OK']: self.logError("Failed to start component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was down, started instance" else: self.accounting[instance]["Treatment"] = "Instance is down, should be started" def _ensureComponentDown(self, shouldBeDown): """Ensure the correct components are not running.""" for instance in shouldBeDown: self.log.info("Stopping instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.stopComponent(system, name) if not res['OK']: self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was running, stopped instance" else: self.accounting[instance]["Treatment"] = "Instance is running, should be stopped" def checkURLs(self): """Ensure that the running services have their URL in the Config.""" self.log.info("Checking URLs") # get services again, in case they were started/stop in controlComponents 
gConfig.forceRefresh(fromMaster=True) res = self.getRunningInstances(instanceType='Services', runitStatus='All') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] for service, options in self.services.iteritems(): self.log.debug("Checking URL for %s with options %s" % (service, options)) # ignore SystemAdministrator, does not have URLs if 'SystemAdministrator' in service: continue self._checkServiceURL(service, options) if self.csAPI.csModified and self.commitURLs: self.log.info("Commiting changes to the CS") result = self.csAPI.commit() if not result['OK']: self.logError('Commit to CS failed', result['Message']) return S_ERROR("Failed to commit to CS") return S_OK() def _checkServiceURL(self, serviceName, options): """Ensure service URL is properly configured in the CS.""" url = self._getURL(serviceName, options) system = options['System'] module = options['Module'] self.log.info("Checking URLs for %s/%s" % (system, module)) urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module) urls = gConfig.getValue(urlsConfigPath, []) self.log.debug("Found configured URLs for %s: %s" % (module, urls)) self.log.debug("This URL is %s" % url) runitStatus = options['RunitStatus'] wouldHave = 'Would have ' if not self.commitURLs else '' if runitStatus == 'Run' and url not in urls: urls.append(url) message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) if runitStatus == 'Down' and url in urls: urls.remove(url) message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) @staticmethod def _getURL(serviceName, options): """Return URL for the service.""" system = options['System'] port = 
options['Port'] host = socket.gethostname() url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName) return url
def __sendAccounting(self, jobID):
    """ Build and commit the WMS accounting record of a stalled job.

    The record is always reported as Failed/Stalled. On a successful commit the
    job's 'AccountedFlag' attribute is set to 'True'.

    :param jobID: ID of the job to account
    :return: the failed getJobAttributes result, S_ERROR("Exception") if collecting
             the job information raised, otherwise the result of the commit
    """
    try:
        accountingReport = Job()
        # Placeholders, only used by the exception message below
        endTime = 'Unknown'
        lastHeartBeatTime = 'Unknown'

        attributes = self.jobDB.getJobAttributes(jobID)
        if not attributes['OK']:
            return attributes
        jobDict = attributes['Value']

        startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        # A heartbeat received after the logged end time becomes the end time
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        normalization = JobMonitoringClient().getJobParameter(jobID, 'CPUNormalizationFactor')
        if normalization['OK'] and normalization['Value']:
            cpuNormalization = float(normalization['Value'].get('CPUNormalizationFactor'))
        else:
            self.log.error('Error getting Job Parameter CPUNormalizationFactor, setting 0',
                           normalization.get('Message', 'No such value'))
            cpuNormalization = 0.0
    except Exception as e:
        details = "for job=%s: endTime=%s, lastHBTime=%s" % (str(jobID), str(endTime), str(lastHeartBeatTime))
        self.log.exception("Exception in __sendAccounting", details, lException=e)
        return S_ERROR("Exception")

    processingType = self.__getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)

    # Fill the accounting data
    acData = {
        'Site': jobDict['Site'],
        'User': jobDict['Owner'],
        'UserGroup': jobDict['OwnerGroup'],
        'JobGroup': jobDict['JobGroup'],
        'JobType': jobDict['JobType'],
        'JobClass': jobDict['JobSplitType'],
        'ProcessingType': processingType,
        'FinalMajorStatus': 'Failed',
        'FinalMinorStatus': 'Stalled',
        'CPUTime': lastCPUTime,
        'NormCPUTime': lastCPUTime * cpuNormalization,
        'ExecTime': lastWallTime,
        'InputDataSize': 0.0,
        'OutputDataSize': 0.0,
        'InputDataFiles': 0,
        'OutputDataFiles': 0,
        'DiskSpace': 0.0,
        'InputSandBoxSize': 0.0,
        'OutputSandBoxSize': 0.0,
        'ProcessedEvents': 0,
    }

    # For accidentally stopped jobs ExecTime can be not set; in any case it is
    # never reported smaller than the CPU time
    if not acData['ExecTime'] or acData['ExecTime'] < acData['CPUTime']:
        acData['ExecTime'] = acData['CPUTime']

    self.log.verbose('Accounting Report is:')
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    commitResult = accountingReport.commit()
    if not commitResult['OK']:
        self.log.error('Failed to send accounting report',
                       'Job: %d, Error: %s' % (int(jobID), commitResult['Message']))
    else:
        self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
    return commitResult
def finalizeRequest(self, requestID, jobID, useCertificates=True):
    """Check the request status and, if it is 'Done', finalise the corresponding job.

    The job's "PendingRequest" parameter is updated with the request digest and the job
    (minor) status is set to REQUESTS_DONE. For a job found in STALLED status, the
    status preceding the stall is taken from the job logging info and restored.

    :param self: self reference
    :param str requestID: request id
    :param int jobID: job id (anything ``int()`` accepts)
    :param bool useCertificates: passed on to the job state update and job monitoring clients
    :return: S_OK(newJobStatus) where newJobStatus is '' if only the minor status was
             updated, S_OK() if the job does not exist any more, S_ERROR otherwise
    """
    stateServer = JobStateUpdateClient(useCertificates=useCertificates)

    # Checking if to update the job status - we should fail here, so it will be re-tried later
    # Checking the state, first
    res = self.getRequestStatus(requestID)
    if not res["OK"]:
        self.log.error("finalizeRequest: failed to get request",
                       "request: %s status: %s" % (requestID, res["Message"]))
        return res
    if res["Value"] != "Done":
        return S_ERROR(
            "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
            % (requestID, res["Value"]))

    # The request is 'Done', let's update the job status. If we fail, we should re-try later
    monitorServer = JobMonitoringClient(useCertificates=useCertificates)

    # Normalise the job ID once: the "%d" log messages below raise a TypeError if
    # they are handed the string form of the ID
    jobID = int(jobID)

    res = monitorServer.getJobSummary(jobID)
    if not res["OK"]:
        self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
        return res
    if not res["Value"]:
        self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
        return S_OK()

    jobStatus = res["Value"]["Status"]
    jobMinorStatus = res["Value"]["MinorStatus"]
    jobAppStatus = ""
    newJobStatus = ""

    if jobStatus == JobStatus.STALLED:
        # If job is stalled, find the previous status from the logging info
        res = monitorServer.getJobLoggingInfo(jobID)
        if not res["OK"]:
            self.log.error("finalizeRequest: Failed to get job logging info", "JobID: %d" % jobID)
            return res
        # Check the last status was Stalled and get the one before
        if len(res["Value"]) >= 2 and res["Value"][-1][0] == JobStatus.STALLED:
            jobStatus, jobMinorStatus, jobAppStatus = res["Value"][-2][:3]
            newJobStatus = jobStatus

    # update the job pending request digest in any case since it is modified
    self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)

    digest = self.getDigest(requestID)
    if digest["OK"]:
        digest = digest["Value"]
        self.log.verbose(digest)
        res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
        if not res["OK"]:
            self.log.info("finalizeRequest: Failed to set job %d parameter: %s" % (jobID, res["Message"]))
            return res
    else:
        # A missing digest is only reported, the status update is attempted anyway
        self.log.error("finalizeRequest: Failed to get request digest for %s: %s"
                       % (requestID, digest["Message"]))

    if jobStatus == JobStatus.COMPLETED:
        # What to do? Depends on what we have in the minorStatus
        if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
            newJobStatus = JobStatus.DONE
        elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
            newJobStatus = JobStatus.FAILED
        elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
            # If the job has been Killed, set it Killed
            newJobStatus = JobStatus.KILLED
        else:
            self.log.error("finalizeRequest: Unexpected jobMinorStatus",
                           "for %d (got %s)" % (jobID, jobMinorStatus))
            return S_ERROR("Unexpected jobMinorStatus")

    if newJobStatus:
        self.log.info(
            "finalizeRequest: Updating job status",
            "for %d to '%s/%s'" % (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
        )
    else:
        self.log.info(
            "finalizeRequest: Updating job minor status",
            "for %d to '%s' (current status is %s)" % (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
        )

    stateUpdate = stateServer.setJobStatus(jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
    # The application status is only known (and restored) for previously stalled jobs
    if jobAppStatus and stateUpdate["OK"]:
        stateUpdate = stateServer.setJobApplicationStatus(jobID, jobAppStatus, "RMS")
    if not stateUpdate["OK"]:
        self.log.error(
            "finalizeRequest: Failed to set job status",
            "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
        )
        return stateUpdate

    return S_OK(newJobStatus)
class WorkflowTasks(TaskBase):
    """ Handles jobs: prepares transformation tasks as job objects, submits them to the WMS
        and follows the status of the submitted tasks and of their files
    """

    def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
                 outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
                 ownerDN=None, ownerGroup=None):
        """ Generates some default objects.
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
            VOs can pass in their job class extension, if present

            Every collaborator that is not passed in is created here. The default WMS client
            uses certificates and a delegated identity only if both ownerDN and ownerGroup are given.
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = bool(ownerDN) and bool(ownerGroup)
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        # Plugin/module objects are created lazily and then cached
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='',
                                   ownerDN='', bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

            Missing owner information is taken from the current proxy, a missing DN is looked up
            from the owner name.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

        The per task values are collected as parameter sequences of one job object, which is
        stored in the returned dictionary under the 'BulkJobObject' key.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000',  # pylint: disable=protected-access
                               "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # Names of the output parameters of all tasks. This used to be reset for every task, so
        # that only the names produced by the last task were known after the loop.
        outputParameterNames = set()
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose('Setting Site: ', str(sites), transID=transID, method=method)
                seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                if isinstance(inputData, basestring):
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                         transID=transID, method=method)
                        seqDict[paramName] = paramValue

            if self.outputDataModule:
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    # NOTE(review): the sequences of this task are not collected, although the
                    # submission later pairs job IDs with the sorted task IDs -- confirm intended
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterNames.add(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                                           name, 'JDL', "%%(%s)s" % name, name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        # These sequences also have to be added to the workflow
        workflowParameters = set(['JOB_ID', 'PRODUCTION_ID', 'InputData']) | outputParameterNames
        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in workflowParameters:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        if taskDict:
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID, method=method, reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

        The job object of each task is stored under its 'TaskObject' key; the key is set to ''
        for the tasks that could not be prepared.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasks'
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)
        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            # no 'Site' parameter in the workflow
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateOK = False
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                                 transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' % (transGroup),
                                 transID=transID, method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID',
                                               'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'JOB_ID',
                                               'string',
                                               '00000000',
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            # NOTE(review): never reassigned, so the output data module always receives
            # 'InputData': None here, unlike in the bulk preparation -- confirm intended
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue
            else:
                self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
                res = oJob.setDestination(sites)
                if not res['OK']:
                    self._logError('Could not set the site: %s' % res['Message'],
                                   transID=transID, method=method)
                    paramsDict['TaskObject'] = ''
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            # stays empty if the output data cannot be generated
            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(),
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob

        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                             transID=transID, method=method)
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters; 'Site' is read here, the whole dictionary is
                                handed to the destination plugin
        :return: list of sites: the sites found by the plugin (restricted to the requested sites,
                 if any), the requested sites (default ['ANY']) if the plugin finds none, or an
                 empty list if the plugin object cannot be created
        """
        try:
            sites = ['ANY']
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                # The callers only test the truth value of what they get back: the (non-empty)
                # S_ERROR dictionary returned previously was taken for a list of sites
                return []
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)

        A failure to set the input data is only logged.

        :param oJob: job object to update
        :param dict paramsDict: task parameters; 'InputData' and 'TransformationID' are read
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID, method='_handleInputs')
            res = oJob.setInputData(inputData)
            if not res['OK']:
                self._logError("Could not set the inputs: %s" % res['Message'],
                               transID=transID, method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination

        Parameters with an empty value are skipped.

        :param oJob: job object to update
        :param dict paramsDict: task parameters
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' % (paramName, paramValue),
                                   transID=transID, method='_handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs

        The job type, input data policy, destination and (if configured) the CEs are overridden
        with the values of the "Hospital" section of the operations helper.

        :param oJob: job object to update
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

        :param str plugin: name of the plugin
        :return: S_OK(plugin object), or S_ERROR() if the module in self.pluginLocation cannot be
                 imported or has no TaskManagerPlugin
        """
        method = '__generatePluginObject'
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e), method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

        The module object is created on first use and then reused.

        :param dict paramDict: argument of the module, set as its paramDict attribute
        :return: result of the module execution, or the error of the module creation
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks

        Bulk submission is used if the dictionary holds a 'BulkJobObject'.

        :param dict taskDict: tasks as returned by prepareTransformationTasks
        :return: S_OK/S_ERROR with updated taskDict
        """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

        On success each task gets its 'ExternalID' and 'Success' set to True. If the number of
        returned job IDs differs from the number of tasks, all tasks are flagged as failed.

        :param dict taskDict: tasks, including the 'BulkJobObject' (which is removed)
        :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID, method=method)
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external', transID=transID, method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

        Each task gets its 'Success' flag and, if submitted, its 'ExternalID'. Tasks without a
        'TaskObject' are counted as failed.

        :param dict taskDict: tasks to submit
        :return: S_OK with updated taskDict
        """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        # transID is only bound if there was at least one task, as are the counters non-zero
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.

        :param job: job description as a string, or an instance of the job class
        :return: result of the job submission, or S_ERROR if no job object can be obtained
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Look up in the WMS the jobs corresponding to the given tasks, by job name

        :param list taskDicts: task dictionaries with 'TransformationID' and 'TaskID'
        :return: S_OK with {'NoTasks': names without a WMS job, 'TaskNameIDs': {name: WMS job ID}},
                 or the error of the job query
        """
        transID = None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                    for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                # the job is then reported among the 'NoTasks'
                self._logWarn("Failed to get task summary from WMS", res['Message'],
                              transID=transID, method='updateTransformationReservedTasks')
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status

        Tasks whose job is not known to the WMS any more are reported as "Failed".

        :param list taskDicts: task dictionaries with 'TaskID', 'ExternalID', 'ExternalStatus'
                               and 'TransformationID'
        :return: S_OK with {new status: [task IDs]} for the tasks whose status changed
        """
        method = 'getSubmittedTaskStatus'

        if taskDicts:
            wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
            transID = taskDicts[0]['TransformationID']
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                     (transID, taskID, oldStatus),
                                     transID=transID, method=method)
                    newStatus = "Failed"
                self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                                 transID=transID, method=method)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN

        Files of tasks without a WMS job and of Failed jobs become 'Unused', files of Done or
        Completed jobs become 'Processed'.

        :param list fileDicts: file dictionaries with 'LFN', 'Status', 'TaskID' and 'TransformationID'
        :return: S_OK with {LFN: new status} for the files whose status changes
        """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID, method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            # jobs in any other status leave their files untouched
            newFileStatus = {'Done': 'Processed',
                             'Completed': 'Processed',
                             'Failed': 'Unused'}.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoring(self):
    """
    Exercise JobStateUpdate and JobMonitoring calls on a single job.

    A hello-world job is submitted; its status, parameters, application
    status and site are set through JobStateUpdate and read back through
    JobMonitoring. Deleting the job is the last step.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])

    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(
        jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # TODO: jobStateUpdate.setJobFlag() / unsetJobFlag() are not exercised
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # TODO: jobMonitor.traceJobParameter('Site', 1, 'Status') is not exercised

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')

    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})

    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})

    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')

    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')

    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')

    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])

    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])

    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')

    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    # (cleanup step; its result is not checked)
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoring(self):
    """
    Exercise JobStateUpdate and JobMonitoring calls on a single job.

    A hello-world job is submitted; its status, parameters, application
    status and site are set through JobStateUpdate and read back through
    JobMonitoring (including the bulk getJobsParameters call). Deleting the
    job is the last step.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])

    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))

    # Asking for no parameter names gives an empty result for a fresh job
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(
        jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # TODO: jobStateUpdate.setJobFlag() / unsetJobFlag() are not exercised
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # TODO: jobMonitor.traceJobParameter('Site', 1, 'Status') is not exercised

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')

    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})

    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})

    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')

    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')

    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')

    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])

    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])

    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')

    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    # (cleanup step; its result is not checked)
    wmsClient.deleteJob(jobID)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """
    Submit several jobs (different sites, types, inputs) and query them.

    One hello-world job is submitted for every combination of destination,
    input data list and job type. The JobMonitoring listing/summary calls are
    then checked, the last submitted job is updated through JobStateUpdate,
    and all jobs are deleted as the last step.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobIDs.append(res['Value'])
    # The single-job update calls further down act on the last submitted job
    lastJobID = jobIDs[-1]

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(
        set(res['Value']).issubset(dests + ['ANY', 'DIRAC.Jenkins.org']))

    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types))

    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']),
                  [['Received'], sorted(['Received', 'Waiting'])])

    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']),
                  [['Job accepted'], sorted(['Job accepted', 'matching'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue({str(jobID) for jobID in jobIDs}.issubset(res['Value']))

    # TODO: jobMonitor.getCounters(attrList) is not exercised

    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    # A status missing from the counters counts as zero jobs, so the check
    # always runs instead of being skipped on a TypeError.
    counters = res['Value']
    submitted = len(dests) * len(lfnss) * len(types)
    self.assertGreaterEqual(
        counters.get('Received', 0) + counters.get('Waiting', 0), submitted)

    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobStateUpdate.setJobStatusBulk(
        lastJobID,
        {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                           'MinorStatus': 'MinorStatus',
                                           'ApplicationStatus': 'ApplicationStatus',
                                           'Source': 'Unknown'}})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({lastJobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    # (cleanup step; its result is not checked)
    wmsClient.deleteJob(jobIDs)
def test_FullChain(self):
    """
    Drive one job through the WMSClient life-cycle calls.

    This test will

    - submit a hello-world job, then reset, reschedule, kill and delete it
      through the WMSClient
    - change the job status in between through JobStateUpdate
    - use the JobMonitoring to verify the resulting status after each step

    The JobCleaningAgent is not invoked here: the last check only verifies
    that the job status became 'Deleted'.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = res['Value']

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Received')

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Killed')

    # updating the status aaaagain
    res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    # this time it won't kill... it's done!
    self.assertEqual(res['Value'], 'Done')

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Deleted')
def test_FullChain(self):
    """
    Drive one job through the WMSClient life-cycle calls.

    This test will

    - submit a hello-world job, then reset, reschedule, kill and delete it
      through the WMSClient
    - change the job status in between through JobStateUpdate
    - use the JobMonitoring to verify the resulting status after each step

    The JobCleaningAgent is not invoked here: the last check only verifies
    that the job status became 'Deleted'.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = res['Value']

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Received')

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Killed')

    # updating the status aaaagain
    res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    # this time it won't kill... it's done!
    self.assertEqual(res['Value'], 'Done')

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Deleted')
import time

__RCSID__ = "$Id$"

# NOTE(review): parseCommandLine() runs before the DIRAC client imports below;
# presumably this ordering is required to initialise DIRAC - keep it.
from DIRAC.Core.Base import Script
Script.parseCommandLine()

from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient

# sut
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient

from DIRAC.tests.Integration.WorkloadManagementSystem.Test_Client_WMS import helloWorldJob, createFile

# Module-level clients, created once at import time
jobMonitoringClient = JobMonitoringClient()
jobStateUpdateClient = JobStateUpdateClient()


def createJob():
    """Submit a hello-world job through a new WMSClient and return its ID.

    :return: the job ID from the submission result, converted to int
    :raises AssertionError: carrying the result's 'Message' when the
        submission result is not OK
    """
    helloJob = helloWorldJob()
    xmlDescription = createFile(helloJob)
    submission = WMSClient().submitJob(helloJob._toJDL(xmlFile=xmlDescription))
    assert submission['OK'], submission['Message']
    return int(submission['Value'])
def __init__(self):
    """Run the TaskBase initialisation, then create the WMS and job monitoring clients."""
    TaskBase.__init__(self)
    # WMSClient instance, kept as the submission client
    self.submissionClient = WMSClient()
    # JobMonitoringClient instance, kept as the job monitoring client
    self.jobMonitoringClient = JobMonitoringClient()