def checkJobStateTransition(jobID, candidateState, currentStatus=None, jobMonitoringClient=None):
    """Utility to check if a job state transition is allowed"""
    # Resolve the current status from the WMS when the caller did not supply one
    if not currentStatus:
        if not jobMonitoringClient:
            # Local import keeps the module importable without the client package loaded
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

            jobMonitoringClient = JobMonitoringClient()
        statusRes = jobMonitoringClient.getJobsStatus(jobID)
        if not statusRes["OK"]:
            return statusRes
        try:
            currentStatus = statusRes["Value"][jobID]["Status"]
        except KeyError:
            return S_ERROR("Job does not exist")

    machineRes = JobsStateMachine(currentStatus).getNextState(candidateState)
    if not machineRes["OK"]:
        return machineRes

    # The state machine echoes the candidate back only when the move is legal
    if candidateState == machineRes["Value"]:
        return S_OK()

    gLogger.error(
        "Job Status Error",
        "%s can't move from %s to %s" % (jobID, currentStatus, candidateState),
    )
    return S_ERROR("Job state transition not allowed")
def test_ParametricChain(self):
    """This test will submit a parametric job which should generate 3 actual jobs"""
    wmsClient = WMSClient()
    stateUpdateClient = JobStateUpdateClient()
    monitoringClient = JobMonitoringClient()

    # Build the parametric job description and submit it
    job = parametricJob()
    descriptionFile = createFile(job)
    result = wmsClient.submitJob(job._toJDL(xmlFile=descriptionFile))
    self.assertTrue(result["OK"], result.get("Message"))

    # A 3-way parametric job must expand into exactly 3 WMS jobs
    submittedIDs = result["Value"]
    self.assertEqual(len(submittedIDs), 3, msg="Got %s" % str(submittedIDs))

    # Each expanded job carries the parametrised name
    result = monitoringClient.getJobsParameters(submittedIDs, ["JobName"])
    self.assertTrue(result["OK"], result.get("Message"))
    observedNames = {result["Value"][jid]["JobName"] for jid in result["Value"]}
    expectedNames = {"parametric_helloWorld_%s" % nJob for nJob in range(3)}
    self.assertEqual(observedNames, expectedNames)

    # Move every job to CHECKING before asking for deletion
    for jid in submittedIDs:
        result = stateUpdateClient.setJobStatus(jid, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(result["OK"], result.get("Message"))

    result = wmsClient.deleteJob(submittedIDs)
    self.assertTrue(result["OK"], result.get("Message"))
    print(result)

    # All jobs must now report the DELETED status
    for jid in submittedIDs:
        result = monitoringClient.getJobsStatus(jid)
        self.assertTrue(result["OK"], result.get("Message"))
        self.assertEqual(result["Value"][jid]["Status"], JobStatus.DELETED, msg="Got %s" % str(result["Value"]))
class WorkflowTasks(TaskBase):
    """ Handles jobs
    """
    # NOTE: this class is Python 2 code (iteritems/itervalues, basestring,
    # StringIO, list-returning dict.values()) — keep that in mind when editing.

    def __init__(self, transClient=None, logger=None, submissionClient=None,
                 jobMonitoringClient=None, outputDataModule=None, jobClass=None,
                 opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None):
        """ Generates some default objects.
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
            VOs can pass in their job class extension, if present
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are only used when both a delegated DN and group are supplied
        useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        # CS options provide the defaults when not passed in explicitly
        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                'Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        # Lazily-instantiated plugin/module objects (see _handleDestination / getOutputData)
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='',
                                   ownerDN='', bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        # Fall back to the current proxy identity when owner/group are not given
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            # getDNForUsername may return several DNs; the first one is used
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict (adds the 'BulkJobObject' entry)
        """
        if taskDict:
            # All tasks belong to the same transformation, so any entry will do
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000',
                               "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        # Transformations listed under Hospital/Transformations get special routing
        if int(transID) in [
                int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose('Setting Site: ', str(sites),
                                 transID=transID, method=method)
                seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                # Normalise a ';'-separated string of LFNs into a list
                if isinstance(inputData, basestring):
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                # A bulk job cannot mix tasks with and without input data
                self._logError(
                    "Invalid mixture of jobs with and without input data")
                return S_ERROR(
                    ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                         transID=transID, method=method)
                        seqDict[paramName] = paramValue

            # NOTE(review): outputParameterList is reset every iteration but read
            # after the loop below, so only the last task's output parameters are
            # considered there — verify this is intended.
            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({
                    'Job': oJob._toXML(), 'TransformationID': transID,  # pylint: disable=protected-access
                    'TaskID': taskID, 'InputData': inputData})
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow,  # pylint: disable=protected-access
                            name, 'JDL', "%%(%s)s" % name, name)

            # Accumulate this task's values into the per-parameter sequences
            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'
                             ] + outputParameterList:
                res = oJob.setParameterSequence(paramName, paramSeq,
                                                addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        if taskDict:
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID, method=method, reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict (each task gets a 'TaskObject')
        """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)
        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            # findParameter returned None: no Site parameter in the template
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateOK = False
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information (done once only)
                self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                                 transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' % (transGroup),
                                 transID=transID, method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID',
                                                str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID', 'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID',
                                               'string', '00000000',
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID, method=method)
                # Empty TaskObject marks the task as not submittable
                paramsDict['TaskObject'] = ''
                continue
            else:
                self._logDebug('Setting Site: ', str(sites),
                               transID=transID, method=method)
                res = oJob.setDestination(sites)
                if not res['OK']:
                    self._logError('Could not set the site: %s' % res['Message'],
                                   transID=transID, method=method)
                    paramsDict['TaskObject'] = ''
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [
                int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                # Accumulate only the time actually spent inside getOutputData
                getOutputDataTiming -= time.time()
                res = self.getOutputData({
                    'Job': oJob._toXML(), 'TransformationID': transID,
                    'TaskID': taskID, 'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob
        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' %
                             (getOutputDataTiming / len(taskDict)),
                             transID=transID, method=method)
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters (may contain 'Site' and 'TargetSE')
        :return: list of candidate sites (or an S_ERROR dict if the plugin
                 could not be instantiated)
        """
        try:
            sites = ['ANY']
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        # Instantiate the destination plugin once and cache it
        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal(
                    "Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID, method='_handleInputs')
            res = oJob.setInputData(inputData)
            if not res['OK']:
                # Failure is logged but not propagated — the task proceeds anyway
                self._logError("Could not set the inputs: %s" % res['Message'],
                               transID=transID, method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' % (paramName, paramValue),
                                   transID=transID, method='_handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite",
                                          'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

        :param str plugin: name of the destination plugin to instantiate
        :return: S_OK(plugin object) / S_ERROR()
        """
        method = '__generatePluginObject'
        try:
            # self.pluginLocation is presumably set by the base class — TODO confirm
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule,
                               'TaskManagerPlugin')('%s' % plugin,
                                                    operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e),
                               method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

        :param dict paramDict: argument dictionary handed to the output-data module
        :return: result of the module's execute()
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()
            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks
        """
        # A 'BulkJobObject' entry (set by __prepareTasksBulk) selects bulk submission
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

        :param dict taskDict: dictionary of tasks, must contain 'BulkJobObject'
        :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK(taskDict)

        startTime = time.time()
        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']

        if oJob is None:
            self._logError('no bulk Job object found', transID=transID,
                           method=method)
            return S_ERROR(ETSUKN,
                           'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external',
                           transID=transID, method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            # Partial submission: mark everything failed rather than guess the mapping
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(
                ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                      (submitted, time.time() - startTime),
                      transID=transID, method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

        :param dict taskDict: dictionary of tasks, each with a 'TaskObject'
        :return: S_OK(taskDict) with per-task 'Success'/'ExternalID' updated
        """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                # Empty TaskObject means preparation already failed for this task
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                          (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.

        :param job: either a JDL/workflow string or an instance of self.jobClass
        :return: result of WMSClient.submitJob
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        # The workflow XML is submitted alongside the JDL
        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Match reserved tasks to their WMS job IDs by job name.

        :param list taskDicts: task dictionaries with 'TransformationID' and 'TaskID'
        :return: S_OK({'NoTasks': [names not found], 'TaskNameIDs': {name: wmsID}})
        """
        transID = None
        jobNames = [
            self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
            for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID,
                           method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                self._logWarn("Failed to get task summary from WMS", res['Message'],
                              transID=transID,
                              method='updateTransformationReservedTasks')
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status
        """
        method = 'getSubmittedTaskStatus'

        if taskDicts:
            wmsIDs = [
                int(taskDict['ExternalID']) for taskDict in taskDicts
                if int(taskDict['ExternalID'])]
            # All tasks are assumed to belong to the same transformation
            transID = taskDicts[0]['TransformationID']
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            # Jobs no longer known to the WMS are treated as Removed, then Failed
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        'Production/Job %d/%d removed from WMS while it is in %s status' %
                        (transID, taskID, oldStatus),
                        transID=transID, method=method)
                    newStatus = "Failed"
                self._logVerbose(
                    'Setting job status for Production/Job %d/%d to %s' %
                    (transID, taskID, newStatus),
                    transID=transID, method=method)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN
        """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName,
                                 {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID, method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        # Files whose task was never found in the WMS go back to 'Unused'
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            # Map terminal job states to the corresponding file state
            newFileStatus = {
                'Done': 'Processed',
                'Completed': 'Processed',
                'Failed': 'Unused'}.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
class WorkflowTasks(TaskBase): """ Handles jobs """ def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None): """ Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger('WorkflowTasks') super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE') else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param transBody: transformation job template :param taskDict: dictionary of per task parameters :param owner: owner of the transformation :param ownerGroup: group of the owner of the transformation :param ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a single job object for bulk submission """ if taskDict: transID = taskDict.values()[0]['TransformationID'] else: return S_OK({}) # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) method = 'prepareTransformationTasksBulk' self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) jobType = oJob.workflow.findParameter('JobType').getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter is added to the workflow if not oJob.workflow.findParameter('JOB_ID'): oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") if oJob.workflow.findParameter('PRODUCTION_ID'): oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") oJob.setType(jobType) self._logVerbose('Adding default transformation group of %s' % (transGroup), 
transID=transID, method=method) oJob.setJobGroup(transGroup) if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]: self._handleHospital(oJob) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose('Setting Site: ', str(sites), transID=transID) seqDict['Site'] = sites seqDict['JobName'] = self._transTaskName(transID, taskID) seqDict['JOB_ID'] = str(taskID).zfill(8) self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # Handle Input Data inputData = paramsDict.get('InputData') if inputData: self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method) seqDict['InputData'] = inputData elif paramSeqDict.get('InputData') is not None: self._logError("Invalid mixture of jobs with and without input data") return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logVerbose('Setting %s to %s' % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, # pylint: disable=protected-access 'TaskID': taskID, 'InputData': inputData}) if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): seqDict[name] = output 
outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access name, 'JDL', "%%(%s)s" % name, name) for pName, seq in seqDict.iteritems(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.iteritems(): if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList: oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: oJob.setParameterSequence(paramName, paramSeq) taskDict['BulkJobObject'] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a job object per task """ method = '__prepareTasks' startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) site = oJobTemplate.workflow.findParameter('Site').getValue() jobType = oJobTemplate.workflow.findParameter('JobType').getValue() templateOK = False getOutputDataTiming = 0. 
        # NOTE(review): this span starts inside __prepareTasks (its `def` is outside
        # the visible chunk) and continues with the legacy Python-2 method suite
        # (iteritems/basestring/StringIO) of the WorkflowTasks class. Code tokens
        # are unchanged; only comments were added.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information (done once, on first task)
                transID = paramsDict['TransformationID']
                self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' % (transGroup),
                                 transID=transID, method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID',
                                               'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'JOB_ID',
                                               'string',
                                               '00000000',
                                               "Initial JOB_ID")

            paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName, transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue
            else:
                self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
                res = oJob.setDestination(sites)
                if not res['OK']:
                    self._logError('Could not set the site: %s' % res['Message'], transID=transID, method=method)
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            # Transformations listed under Hospital/Transformations in the CS get
            # special "hospital" job treatment
            hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                # getOutputDataTiming accumulates wall-clock time spent in getOutputData
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                          'TaskID': taskID, 'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob
        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                             transID=transID, method=method)
        self._logInfo('Prepared %d tasks' % len(taskDict), transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters """

        try:
            sites = ['ANY']
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        # The destination plugin object is created once and cached on the instance
        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata) """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData, transID=transID, method='handleInputs')
            oJob.setInputData(inputData)

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' % (paramName, paramValue), transID=transID, method='handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name
        """
        # self.pluginLocation is presumably set by the base class / subclass — not visible here
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e))
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e))
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin """
        if not self.outputDataModule_o:
            # Create the module object (cached on the instance after the first call)
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks """
        # A 'BulkJobObject' key marks a taskDict prepared for bulk (parametric) submission
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk')
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            # Mark everything failed: we cannot tell which tasks actually got a job
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method='submitTransformationTasksBulk')
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one """
        method = 'submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'], transID=transID, method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed), transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job to the WMS.
        """
        # Accept either a JDL/workflow string or an already-built job object
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        # Map task names to WMS job IDs; tasks with no matching WMS job go to 'NoTasks'
        transID = None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                self._logWarn("Failed to get task summary from WMS", res['Message'],
                              transID=transID, method='updateTransformationReservedTasks')
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status
        """
        if taskDicts:
            wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
            transID = taskDicts[0]['TransformationID']
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system", transID=transID)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            # A job absent from the WMS answer is considered 'Removed'
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                     (transID, taskID, oldStatus), transID=transID)
                    newStatus = "Failed"
                self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                                 transID=transID)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN
        """
        if not fileDicts:
            return S_OK({})
        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files", transID=transID)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        # Files whose task has no WMS job go back to 'Unused'
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system", transID=transID)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            # Translate WMS job status into a file status; other statuses leave files untouched
            newFileStatus = {'Done': 'Processed',
                             'Completed': 'Processed',
                             'Failed': 'Unused'}.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
class WorkflowTasks(TaskBase):
    """Handles jobs"""

    def __init__(
        self,
        transClient=None,
        logger=None,
        submissionClient=None,
        jobMonitoringClient=None,
        outputDataModule=None,
        jobClass=None,
        opsH=None,
        destinationPlugin=None,
        ownerDN=None,
        ownerGroup=None,
    ):
        """Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
        """
        if not logger:
            logger = gLogger.getSubLogger(self.__class__.__name__)

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Delegated submission only makes sense when both DN and group are given
        useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False

        if not submissionClient:
            self.submissionClient = WMSClient(
                useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup
            )
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue("Transformations/DestinationPlugin", "BySE")
        else:
            self.destinationPlugin = destinationPlugin

        # Lazily-instantiated plugin/module objects (cached after first use)
        self.destinationPlugin_o = None

        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False):
        """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        # Fall back to the current proxy identity when owner info is not provided
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res["OK"]:
                return res
            proxyInfo = res["Value"]
            owner = proxyInfo["username"]
            ownerGroup = proxyInfo["group"]

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res["OK"]:
                return res
            ownerDN = res["Value"][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a single job object for bulk submission

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasksBulk"
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose("Setting job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)

        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        try:
            site = oJob.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJob.workflow.findParameter("JobType").getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter("JOB_ID"):
            oJob._addParameter(oJob.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID")

        if oJob.workflow.findParameter("PRODUCTION_ID"):
            oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                "PRODUCTION_ID",
                "string",
                str(transID).zfill(8),
                "Production ID",
            )
        oJob.setType(jobType)
        self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        clinicPath = self._checkSickTransformations(transID)
        if clinicPath:
            self._handleHospital(oJob, clinicPath)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose("Setting Site: ", str(sites), transID=transID, method=method)
                seqDict["Site"] = sites

            seqDict["JobName"] = self._transTaskName(transID, taskID)
            seqDict["JOB_ID"] = str(taskID).zfill(8)

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # Handle Input Data
            inputData = paramsDict.get("InputData")
            if inputData:
                if isinstance(inputData, six.string_types):
                    inputData = inputData.replace(" ", "").split(";")
                self._logVerbose("Setting input data to %s" % inputData, transID=transID, method=method)
                seqDict["InputData"] = inputData
            elif paramSeqDict.get("InputData") is not None:
                # Parameter sequences must have a value for every job: either all
                # tasks carry input data or none of them do
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.items():
                if paramName not in ("InputData", "Site", "TargetSE"):
                    if paramValue:
                        self._logVerbose("Setting %s to %s" % (paramName, paramValue), transID=transID, method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData(
                    {
                        "Job": oJob._toXML(),  # pylint: disable=protected-access
                        "TransformationID": transID,
                        "TaskID": taskID,
                        "InputData": inputData,
                    }
                )
                if not res["OK"]:
                    self._logError("Failed to generate output data", res["Message"], transID=transID, method=method)
                    continue
                for name, output in res["Value"].items():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow, name, "JDL", "%%(%s)s" % name, name  # pylint: disable=protected-access
                        )

            for pName, seq in seqDict.items():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.items():
            if paramName in ["JOB_ID", "PRODUCTION_ID", "InputData"] + outputParameterList:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res["OK"]:
                return res

        if taskDict:
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)

        taskDict["BulkJobObject"] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a job object per task

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasks"
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJobTemplate.workflow.findParameter("JobType").getValue()
        templateOK = False
        getOutputDataTiming = 0.0
        for taskID, paramsDict in taskDict.items():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information (done once, on first task)
                self._logVerbose("Job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose(
                    "Adding default transformation group of %s" % (transGroup), transID=transID, method=method
                )
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter("PRODUCTION_ID"):
                    oJobTemplate._setParamValue("PRODUCTION_ID", str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(
                        oJobTemplate.workflow, "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID"
                    )
                if not oJobTemplate.workflow.findParameter("JOB_ID"):
                    oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID")

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose("Setting task name to %s" % constructedName, transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue("JOB_ID", str(taskID).zfill(8))
            inputData = None

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites", transID=transID, method=method)
                paramsDict["TaskObject"] = ""
                continue
            else:
                self._logDebug("Setting Site: ", str(sites), transID=transID, method=method)
                res = oJob.setDestination(sites)
                if not res["OK"]:
                    self._logError("Could not set the site: %s" % res["Message"], transID=transID, method=method)
                    paramsDict["TaskObject"] = ""
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            clinicPath = self._checkSickTransformations(transID)
            if clinicPath:
                self._handleHospital(oJob, clinicPath)

            paramsDict["TaskObject"] = ""
            if self.outputDataModule:
                # getOutputDataTiming accumulates wall-clock time spent in getOutputData
                getOutputDataTiming -= time.time()
                res = self.getOutputData(
                    {"Job": oJob._toXML(), "TransformationID": transID, "TaskID": taskID, "InputData": inputData}
                )
                getOutputDataTiming += time.time()
                if not res["OK"]:
                    self._logError("Failed to generate output data", res["Message"], transID=transID, method=method)
                    continue
                for name, output in res["Value"].items():
                    oJob._addJDLParameter(name, ";".join(output))
            paramsDict["TaskObject"] = oJob
        if taskDict:
            self._logVerbose(
                "Average getOutputData time: %.1f per task" % (getOutputDataTiming / len(taskDict)),
                transID=transID,
                method=method,
            )
            self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """Handle Sites and TargetSE in the parameters"""

        try:
            sites = ["ANY"]
            if paramsDict["Site"]:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict["Site"], sepChar=";")
        except KeyError:
            pass

        # The destination plugin object is created once and cached on the instance
        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res["OK"]:
                self._logFatal("Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res["Value"]
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ["ANY"]:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """set job inputs (+ metadata)"""
        inputData = paramsDict.get("InputData")
        transID = paramsDict["TransformationID"]
        if inputData:
            self._logVerbose("Setting input data to %s" % inputData, transID=transID, method="_handleInputs")
            res = oJob.setInputData(inputData)
            if not res["OK"]:
                self._logError("Could not set the inputs: %s" % res["Message"], transID=transID, method="_handleInputs")

    def _handleRest(self, oJob, paramsDict):
        """add as JDL parameters all the other parameters that are not for inputs or destination"""
        transID = paramsDict["TransformationID"]
        for paramName, paramValue in paramsDict.items():
            if paramName not in ("InputData", "Site", "TargetSE"):
                if paramValue:
                    self._logDebug("Setting %s to %s" % (paramName, paramValue), transID=transID, method="_handleRest")
                    oJob._addJDLParameter(paramName, paramValue)

    def _checkSickTransformations(self, transID):
        """Check if the transformation is in the transformations to be processed at Hospital or Clinic"""
        transID = int(transID)
        clinicPath = "Hospital"
        # First the general hospital, then each configured clinic
        if transID in set(int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])):
            return clinicPath

        if "Clinics" in self.opsH.getSections("Hospital").get("Value", []):
            basePath = os.path.join("Hospital", "Clinics")
            clinics = self.opsH.getSections(basePath)["Value"]
            for clinic in clinics:
                clinicPath = os.path.join(basePath, clinic)
                if transID in set(int(x) for x in self.opsH.getValue(os.path.join(clinicPath, "Transformations"), [])):
                    return clinicPath

        return None

    def _handleHospital(self, oJob, clinicPath):
        """Optional handle of hospital/clinic jobs"""
        if not clinicPath:
            return
        oJob.setInputDataPolicy("download", dataScheduling=False)

        # Check first for a clinic, if not it must be the general hospital
        hospitalSite = self.opsH.getValue(os.path.join(clinicPath, "ClinicSite"), "")
        hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"), [])
        # If not found, get the hospital parameters
        if not hospitalSite:
            hospitalSite = self.opsH.getValue("Hospital/HospitalSite", "DIRAC.JobDebugger.ch")
        if not hospitalCEs:
            hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])

        oJob.setDestination(hospitalSite)
        if hospitalCEs:
            oJob._addJDLParameter("GridCE", hospitalCEs)

    def __generatePluginObject(self, plugin):
        """This simply instantiates the TaskManagerPlugin class with the relevant plugin name"""
        method = "__generatePluginObject"
        # self.pluginLocation is presumably set by the base class / subclass — not visible here
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ["TaskManagerPlugin"])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule, "TaskManagerPlugin")("%s" % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e), method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """Get the list of job output LFNs from the provided plugin"""
        if not self.outputDataModule_o:
            # Create the module object (cached on the instance after the first call)
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance["OK"]:
                return moduleInstance
            self.outputDataModule_o = moduleInstance["Value"]
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """Submit the tasks"""
        # A 'BulkJobObject' key marks a taskDict prepared for bulk (parametric) submission
        if "BulkJobObject" in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """Submit jobs in one go with one parametric job"""
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = "__submitTransformationTasksBulk"

        oJob = taskDict.pop("BulkJobObject")
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = list(taskDict.values())[0]["TransformationID"]
        if oJob is None:
            self._logError("no bulk Job object found", transID=transID, method=method)
            return S_ERROR(ETSUKN, "No bulk job object provided for submission")

        result = self.submitTaskToExternal(oJob)
        if not result["OK"]:
            self._logError("Failed to submit tasks to external", transID=transID, method=method)
            return result

        jobIDList = result["Value"]
        if len(jobIDList) != len(taskDict):
            # Mark everything failed: we cannot tell which tasks actually got a job
            for task in taskDict.values():
                task["Success"] = False
            return S_ERROR(ETSUKN, "Submitted less number of jobs than requested tasks")
        # Get back correspondence with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]["ExternalID"] = jobID
            taskDict[taskID]["Success"] = True

        submitted = len(jobIDList)
        self._logInfo(
            "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime),
            transID=transID,
            method=method,
        )
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """Submit jobs one by one"""
        method = "__submitTransformationTasks"
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.values():
            transID = task["TransformationID"]
            if not task["TaskObject"]:
                task["Success"] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task["TaskObject"])
            if res["OK"]:
                task["ExternalID"] = res["Value"]
                task["Success"] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res["Message"], transID=transID, method=method)
                task["Success"] = False
                failed += 1
        if submitted:
            self._logInfo(
                "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime),
                transID=transID,
                method=method,
            )
        if failed:
            self._logError("Failed to submit %d tasks to WMS." % (failed), transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """Submits a single job (which can be a bulk one) to the WMS."""
        # Accept either a JDL/workflow string or an already-built job object
        if isinstance(job, six.string_types):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", "", x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        # Map task names to WMS job IDs; tasks with no matching WMS job go to 'NoTasks'
        transID = None
        jobNames = [
            self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({"JobName": jobNames})
        if not res["OK"]:
            self._logError(
                "Failed to get task from WMS",
                res["Message"],
                transID=transID,
                method="updateTransformationReservedTasks",
            )
            return res
        jobNameIDs = {}
        for wmsID in res["Value"]:
            res = self.jobMonitoringClient.getJobSummary(int(wmsID))
            if not res["OK"]:
                self._logWarn(
                    "Failed to get task summary from WMS",
                    res["Message"],
                    transID=transID,
                    method="updateTransformationReservedTasks",
                )
            else:
                jobNameIDs[res["Value"]["JobName"]] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status
        """
        method = "getSubmittedTaskStatus"

        if taskDicts:
            wmsIDs = [int(taskDict["ExternalID"]) for taskDict in taskDicts if int(taskDict["ExternalID"])]
            transID = taskDicts[0]["TransformationID"]
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method)
            return res
        statusDict = res["Value"]
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict["TaskID"]
            wmsID = int(taskDict["ExternalID"])
            if not wmsID:
                continue
            oldStatus = taskDict["ExternalStatus"]
            # A job absent from the WMS answer is considered 'Removed'
            newStatus = statusDict.get(wmsID, {}).get("Status", "Removed")
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        "Production/Job %d/%d removed from WMS while it is in %s status"
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method,
                    )
                    newStatus = "Failed"
                self._logVerbose(
                    "Setting job status for Production/Job %d/%d to %s" % (transID, taskID, newStatus),
                    transID=transID,
                    method=method,
                )
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN
        """
        if not fileDicts:
            return S_OK({})

        method = "getSubmittedFileStatus"

        # All files are from the same transformation
        transID = fileDicts[0]["TransformationID"]
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict["TaskID"])
            taskFiles.setdefault(jobName, {})[fileDict["LFN"]] = fileDict["Status"]

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res["OK"]:
            self._logWarn("Failed to obtain taskIDs for files", transID=transID, method=method)
            return res
        noTasks = res["Value"]["NoTasks"]
        taskNameIDs = res["Value"]["TaskNameIDs"]

        updateDict = {}
        # Files whose task has no WMS job go back to UNUSED
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].items():
                if oldStatus != TransformationFilesStatus.UNUSED:
                    updateDict[lfn] = TransformationFilesStatus.UNUSED

        res = self.jobMonitoringClient.getJobsStatus(list(taskNameIDs.values()))
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method)
            return res
        statusDict = res["Value"]
        for jobName, wmsID in taskNameIDs.items():
            jobStatus = statusDict.get(wmsID, {}).get("Status")
            # Translate WMS job status into a file status; other statuses leave files untouched
            newFileStatus = {
                "Done": TransformationFilesStatus.PROCESSED,
                "Completed": TransformationFilesStatus.PROCESSED,
                "Failed": TransformationFilesStatus.UNUSED,
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].items():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
if batchIDs: if batchID not in batchIDs: continue allJobs.add(job) if full or status == [None]: allJobs.add(job) result.setdefault(job, {})['Status'] = status result[job]['Node'] = node result[job]['LocalJobID'] = batchID wnJobs[node] = wnJobs.setdefault(node, 0) + 1 # If necessary get jobs' status statusCounters = {} if allJobs: allJobs = sorted(allJobs, reverse=True) res = monitoring.getJobsStatus(allJobs) if res['OK']: jobStatus = res['Value'] res = monitoring.getJobsMinorStatus(allJobs) if res['OK']: jobMinorStatus = res['Value'] res = monitoring.getJobsApplicationStatus(allJobs) if res['OK']: jobApplicationStatus = res['Value'] if not res['OK']: gLogger.error('Error getting job parameter', res['Message']) else: for job in allJobs: stat = jobStatus.get(job, {}).get('Status', 'Unknown') + '; ' + \ jobMinorStatus.get(job, {}).get('MinorStatus', 'Unknown') + '; ' + \ jobApplicationStatus.get(job, {}).get('ApplicationStatus', 'Unknown')
def test_FullChain(self):
    """This test will

    - call all the WMSClient methods that will end up calling all the
      JobManager service methods
    - use the JobMonitoring to verify few properties
    - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertIsInstance(res["Value"], int, msg="Got %s" % type(res["Value"]))
    self.assertEqual(res["Value"], res["JobID"],
                     msg="Got %s, expected %s" % (str(res["Value"]), res["JobID"]))
    # res["Value"] and res["JobID"] were just asserted equal: one assignment suffices
    # (the previous version assigned jobID twice, the first being dead code)
    jobID = res["Value"]

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "Executing Minchiapp", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))

    # reschedule the job: the job should be back to RECEIVED afterwards
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.RECEIVED,
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsMinorStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"MinorStatus": "Job Rescheduled"}},
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsApplicationStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"ApplicationStatus": "Unknown"}},
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsStates([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(
        res["Value"],
        {jobID: {
            "Status": JobStatus.RECEIVED,
            "MinorStatus": "Job Rescheduled",
            "ApplicationStatus": "Unknown"
        }},
        msg="Got %s" % str(res["Value"]),
    )

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.WAITING, "waiting", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.MATCHED, "matched", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.KILLED,
                     msg="Got %s" % str(res["Value"]))

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.DELETED,
                     msg="Got %s" % str(res["Value"]))
def test_JobStateUpdateAndJobMonitoring(self):
    """Verifying all JobStateUpdate and JobMonitoring functions"""
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    jobID = int(res["Value"])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobOwner(jobID)
    self.assertTrue(res["OK"], res.get("Message"))

    # Adding stuff
    # forcing the update
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "running", "source", None, True)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobParameters(jobID, [("par1", "par1Value"), ("par2", "par2Value")])
    # give the parameters backend time to record the values before we query them
    time.sleep(5)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobApplicationStatus(jobID, "app status", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobSite(jobID, "Site")
    self.assertTrue(res["OK"], res.get("Message"))

    # now checking few things
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.RUNNING,
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobParameter(jobID, "par1")
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {"par1": "par1Value"}, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"par1": "par1Value", "par2": "par2Value"}},
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobParameters(jobID, "par1")
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"par1": "par1Value"}},
                     msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobAttribute(jobID, "Site")
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], "Site", msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["ApplicationStatus"], "app status",
                     msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
    self.assertEqual(res["Value"]["JobName"], "helloWorld",
                     msg="Got %s" % str(res["Value"]["JobName"]))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["ApplicationStatus"], "app status",
                     msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
    self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING,
                     msg="Got %s" % str(res["Value"]["Status"]))
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.DONE, "MinorStatus", "Unknown")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.DONE,
                     msg="Got %s" % str(res["Value"]["Status"]))
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus",
                     msg="Got %s" % str(res["Value"]["MinorStatus"]))
    self.assertEqual(res["Value"]["ApplicationStatus"], "app status",
                     msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
    res = jobStateUpdate.sendHeartBeat(jobID, {"bih": "bih"}, {"boh": "boh"})
    self.assertTrue(res["OK"], res.get("Message"))

    # delete the job - this will just set its status to "deleted"
    # (previously the return value was silently discarded; check it like every other call)
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
def main():
    """List the jobs found at a given (BOINC) site.

    Jobs can be filtered by status, minor status, worker node and local batch
    ID, and restricted to a date or a "since" window.  For each selected job
    the WMS job parameters 'HostName' and 'LocalJobID' are fetched to build a
    per-worker-node summary; with --Full (implicit when filtering by WN or
    batch ID) the individual jobs are printed as well.
    """
    # defaults for all selection switches
    site = 'BOINC.World.org'
    status = ["Running"]
    minorStatus = None
    workerNodes = None
    since = None
    date = 'today'
    full = False
    until = None
    batchIDs = None
    Script.registerSwitch('', 'Site=', ' Select site (default: %s)' % site)
    Script.registerSwitch('', 'Status=', ' Select status (default: %s)' % status)
    Script.registerSwitch('', 'MinorStatus=', ' Select minor status')
    Script.registerSwitch('', 'WorkerNode=', ' Select WN')
    Script.registerSwitch('', 'BatchID=', ' Select batch jobID')
    Script.registerSwitch(
        '', 'Since=',
        ' Date since when to select jobs, or number of days (default: today)')
    Script.registerSwitch('', 'Date=', ' Specify the date (check for a full day)')
    Script.registerSwitch(
        '', 'Full',
        ' Printout full list of job (default: False except if --WorkerNode)')
    Script.parseCommandLine()

    from DIRAC import gLogger
    from DIRAC.Interfaces.API.Dirac import Dirac
    from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

    switches = Script.getUnprocessedSwitches()
    for switch in switches:
        if switch[0] == 'Site':
            site = switch[1]
        elif switch[0] == 'MinorStatus':
            minorStatus = switch[1]
        elif switch[0] == 'Status':
            if switch[1].lower() == 'all':
                status = [None]
            else:
                status = switch[1].split(',')
        elif switch[0] == 'WorkerNode':
            workerNodes = switch[1].split(',')
        elif switch[0] == 'BatchID':
            try:
                # Validate that each ID is an integer, but keep them as
                # (normalized) strings: the 'LocalJobID' job parameter they are
                # compared against below is a string, so keeping ints here
                # would make the filter never match.
                batchIDs = [str(int(bid)) for bid in switch[1].split(',')]
            except ValueError:
                # only int() can fail here; don't swallow unrelated errors
                gLogger.error('Invalid jobID', switch[1])
                DIRAC.exit(1)
        elif switch[0] == 'Full':
            full = True
        elif switch[0] == 'Date':
            since = switch[1].split()[0]
            until = str(
                datetime.datetime.strptime(since, '%Y-%m-%d') +
                datetime.timedelta(days=1)).split()[0]
        elif switch[0] == 'Since':
            date = switch[1].lower()
            if date == 'today':
                since = None
            elif date == 'yesterday':
                since = 1
            elif date == 'ever':
                since = 2 * 365
            elif date.isdigit():
                since = int(date)
                date += ' days'
            else:
                since = date
            # a numeric "since" is a number of days ago: turn it into a date
            if isinstance(since, int):
                since = str(datetime.datetime.now() -
                            datetime.timedelta(days=since)).split()[0]

    if workerNodes or batchIDs:
        # status = [None]
        full = True

    monitoring = JobMonitoringClient()
    dirac = Dirac()

    # Get jobs according to selection
    jobs = set()
    for stat in status:
        res = dirac.selectJobs(site=site, date=since, status=stat, minorStatus=minorStatus)
        if not res['OK']:
            gLogger.error('Error selecting jobs', res['Message'])
            DIRAC.exit(1)
        allJobs = set(int(job) for job in res['Value'])
        if until:
            # --Date selects a single full day: subtract jobs after "until"
            res = dirac.selectJobs(site=site, date=until, status=stat)
            if not res['OK']:
                gLogger.error('Error selecting jobs', res['Message'])
                DIRAC.exit(1)
            allJobs -= set(int(job) for job in res['Value'])
        jobs.update(allJobs)
    if not jobs:
        gLogger.always('No jobs found...')
        DIRAC.exit(0)

    # res = monitoring.getJobsSummary( jobs )
    # print eval( res['Value'] )[jobs[0]]

    allJobs = set()
    result = {}
    wnJobs = {}
    gLogger.always('%d jobs found' % len(jobs))
    # Get host name and local batch ID of each job, applying the WN/batchID filters
    for job in jobs:
        res = monitoring.getJobParameter(job, 'HostName')
        node = res.get('Value', {}).get('HostName', 'Unknown')
        res = monitoring.getJobParameter(job, 'LocalJobID')
        batchID = res.get('Value', {}).get('LocalJobID', 'Unknown')
        if workerNodes:
            if not [wn for wn in workerNodes if node.startswith(wn)]:
                continue
            allJobs.add(job)
        if batchIDs:
            # batchID (string parameter) against the normalized string IDs
            if batchID not in batchIDs:
                continue
            allJobs.add(job)
        if full or status == [None]:
            allJobs.add(job)
        result.setdefault(job, {})['Status'] = status
        result[job]['Node'] = node
        result[job]['LocalJobID'] = batchID
        wnJobs[node] = wnJobs.setdefault(node, 0) + 1

    # If necessary get jobs' status
    statusCounters = {}
    if allJobs:
        allJobs = sorted(allJobs, reverse=True)
        res = monitoring.getJobsStatus(allJobs)
        if res['OK']:
            jobStatus = res['Value']
            res = monitoring.getJobsMinorStatus(allJobs)
            if res['OK']:
                jobMinorStatus = res['Value']
                res = monitoring.getJobsApplicationStatus(allJobs)
                if res['OK']:
                    jobApplicationStatus = res['Value']
        if not res['OK']:
            gLogger.error('Error getting job parameter', res['Message'])
        else:
            for job in allJobs:
                stat = jobStatus.get(job, {}).get('Status', 'Unknown') + '; ' + \
                    jobMinorStatus.get(job, {}).get('MinorStatus', 'Unknown') + '; ' + \
                    jobApplicationStatus.get(job, {}).get('ApplicationStatus', 'Unknown')
                result[job]['Status'] = stat
                statusCounters[stat] = statusCounters.setdefault(stat, 0) + 1
    elif not workerNodes and not batchIDs:
        allJobs = sorted(jobs, reverse=True)

    # Print out result
    if workerNodes or batchIDs:
        gLogger.always('Found %d jobs at %s, WN %s (since %s):' %
                       (len(allJobs), site, workerNodes, date))
        if allJobs:
            gLogger.always('List of jobs:', ','.join([str(job) for job in allJobs]))
    else:
        if status == [None]:
            gLogger.always('Found %d jobs at %s (since %s):' % (len(allJobs), site, date))
            for stat in sorted(statusCounters):
                gLogger.always('%d jobs %s' % (statusCounters[stat], stat))
        else:
            gLogger.always('Found %d jobs %s at %s (since %s):' %
                           (len(allJobs), status, site, date))
        # sort worker nodes by decreasing job count; the cmp= keyword used
        # previously was removed in Python 3 and raised a TypeError
        gLogger.always(
            'List of WNs:',
            ','.join([
                '%s (%d)' % (node, wnJobs[node])
                for node in sorted(wnJobs, key=lambda n: wnJobs[n], reverse=True)
            ]))
    if full:
        if workerNodes or batchIDs:
            # group jobs per (short) worker node name for a per-WN printout
            nodeJobs = {}
            for job in allJobs:
                status = result[job]['Status']
                node = result[job]['Node'].split('.')[0]
                jobID = result[job].get('LocalJobID')
                nodeJobs.setdefault(node, []).append((jobID, job, status))
            if not workerNodes:
                workerNodes = sorted(nodeJobs)
            for node in workerNodes:
                for job in nodeJobs.get(node.split('.')[0], []):
                    gLogger.always('%s ' % node + '(%s): %s - %s' % job)
        else:
            for job in allJobs:
                status = result[job]['Status']
                node = result[job]['Node']
                jobID = result[job].get('LocalJobID')
                gLogger.always('%s (%s): %s - %s' % (node, jobID, job, status))