class WorkflowTasks(TaskBase):
    """ Handles jobs

    Turns transformation tasks into WMS jobs: prepares job objects from a
    transformation body (either one job per task or a single parametric
    "bulk" job), submits them through the submission client and translates
    WMS job statuses back into task / file statuses.
    """

    def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
                 outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
                 ownerDN=None, ownerGroup=None):
        """ Generates some default objects.

        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present

        :param transClient: passed through to the TaskBase constructor
        :param logger: logger to use; a 'WorkflowTasks' sub-logger of gLogger when not given
        :param submissionClient: client used to submit jobs; a WMSClient when not given
        :param jobMonitoringClient: client used to query jobs; a JobMonitoringClient when not given
        :param outputDataModule: output data module; read from the
                                 "Transformations/OutputDataModule" option when not given
        :param jobClass: class used to build job objects; Job when not given
        :param opsH: operations helper; an Operations() instance when not given
        :param destinationPlugin: destination plugin name; read from the
                                  'Transformations/DestinationPlugin' option (default 'BySE') when not given
        :param ownerDN: DN handed to the default WMSClient as delegatedDN
        :param ownerGroup: group handed to the default WMSClient as delegatedGroup
        """
        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are only used when both the DN and the group are known
        useCertificates = bool(ownerDN) and bool(ownerGroup)

        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        # opsH must be set before the two option look-ups below
        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        # Plugin / module objects are created lazily on first use and then cached
        self.destinationPlugin_o = None
        self.outputDataModule_o = None

    def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

        When owner or ownerGroup is missing both are taken from the current proxy; when
        ownerDN is missing the first DN registered for the owner is used.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """
        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

        On success the job object is stored in taskDict under the 'BulkJobObject' key.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # Names of the output parameters produced by the output data module.
        # Accumulated over ALL tasks: they are needed after the loop to decide
        # which sequences must also be added to the workflow.
        outputParameterNames = set()
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            self._logVerbose('Setting Site: ', str(sites), transID=transID, method=method)
            seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                if isinstance(inputData, basestring):
                    # A string is a ';' separated list of LFNs, possibly with spaces
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID, method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                # An earlier task had input data, this one does not
                self._logError("Invalid mixture of jobs with and without input data")
                return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                                         transID=transID, method=method)
                        seqDict[paramName] = paramValue

            if self.outputDataModule:
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    # NOTE(review): skipping here leaves this task in taskDict while none
                    # of its parameters enter the sequences - confirm this is intended.
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterNames.add(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                                           name, 'JDL', "%%(%s)s" % name, name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        # These sequences are also propagated to the workflow parameters
        workflowParams = set(['JOB_ID', 'PRODUCTION_ID', 'InputData']) | outputParameterNames
        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in workflowParams:
                res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        self._logInfo('Prepared %d tasks' % len(taskDict),
                      transID=transID, method=method, reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

        Each task's parameter dictionary gets a 'TaskObject' entry: the job object on
        success, an empty string when the job could not be prepared.

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if not taskDict:
            return S_OK({})
        transID = taskDict.values()[0]['TransformationID']

        method = '__prepareTasks'
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            # No 'Site' parameter in the workflow
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()

        # Update the template once with the information common to all tasks
        self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        transGroup = str(transID).zfill(8)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJobTemplate.setJobGroup(transGroup)
        if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
            oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                       'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID")
        if not oJobTemplate.workflow.findParameter('JOB_ID'):
            oJobTemplate._addParameter(oJobTemplate.workflow,  # pylint: disable=protected-access
                                       'JOB_ID', 'string', '00000000', "Initial JOB_ID")

        hospitalOption = "Hospital/Transformations"
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType

            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID, method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))  # pylint: disable=protected-access
            # NOTE(review): the output data module always receives InputData=None in this
            # code path, while the bulk path passes the task input data - confirm intended.
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                           transID=transID, method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites', transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue
            self._logDebug('Setting Site: ', str(sites), transID=transID, method=method)
            res = oJob.setDestination(sites)
            if not res['OK']:
                self._logError('Could not set the site: %s' % res['Message'],
                               transID=transID, method=method)
                paramsDict['TaskObject'] = ''
                continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [int(x) for x in self.opsH.getValue(hospitalOption, [])]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            # Stays empty unless the output data step (if any) succeeds
            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({'Job': oJob._toXML(),  # pylint: disable=protected-access
                                          'TransformationID': transID,
                                          'TaskID': taskID,
                                          'InputData': inputData})
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data", res['Message'],
                                   transID=transID, method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))  # pylint: disable=protected-access
            paramsDict['TaskObject'] = oJob

        self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                         transID=transID, method=method)
        self._logInfo('Prepared %d tasks' % len(taskDict),
                      transID=transID, method=method, reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters

        :param dict paramsDict: task parameters; 'Site' (';' separated) is read when present
                                and the whole dictionary is handed to the destination plugin

        :return: list of destination sites; an empty list when the destination plugin
                 object cannot be created or when the plugin sites and the requested
                 sites do not intersect
        """
        sites = ['ANY']
        try:
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal("Could not generate a destination plugin object")
                # Callers expect a list of sites and treat an empty one as a failure;
                # returning the (truthy) error structure would be used as a site list.
                return []
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)

        :param oJob: job object to update
        :param dict paramsDict: task parameters; reads 'InputData' and 'TransformationID'
        """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID, method='_handleInputs')
            res = oJob.setInputData(inputData)
            if not res['OK']:
                # Only logged: the failure is not propagated to the caller
                self._logError("Could not set the inputs: %s" % res['Message'],
                               transID=transID, method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination

        :param oJob: job object to update
        :param dict paramsDict: task parameters; entries with a false value are ignored
        """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' % (paramName, paramValue),
                                   transID=transID, method='_handleRest')
                    oJob._addJDLParameter(paramName, paramValue)  # pylint: disable=protected-access

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs

        Sets the job type to 'Hospital', the input data policy to 'download' and the
        destination to the "Hospital/HospitalSite" option; adds a 'GridCE' JDL
        parameter when "Hospital/HospitalCEs" is set.

        :param oJob: job object to update
        """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)  # pylint: disable=protected-access

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name

        :param str plugin: plugin name handed to the TaskManagerPlugin constructor

        :return: S_OK(plugin object) / S_ERROR with the failure message
        """
        method = '__generatePluginObject'
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
        except ImportError as e:
            errorMsg = "Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)
            self._logException(errorMsg, method=method)
            return S_ERROR(errorMsg)
        try:
            plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            errorMsg = "Failed to create %s(): %s." % (plugin, e)
            self._logException(errorMsg, method=method)
            return S_ERROR(errorMsg)

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin

        The module object is created on first use and cached.

        :param dict paramDict: "argument" set on the module before it is executed

        :return: result of the module execute() call, or the error from the module factory
        """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks

        Bulk submission is used when taskDict carries a 'BulkJobObject' entry.

        :param dict taskDict: dictionary of per task parameters

        :return: S_OK/S_ERROR with updated taskDict
        """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job

        :param dict taskDict: dictionary of per task parameters plus the 'BulkJobObject' entry
                              (removed from the dictionary here)

        :return: S_OK/S_ERROR with updated taskDict ('ExternalID' and 'Success' set per task)
        """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject', None)
        # Only task entries are left now: nothing to submit if there is none
        if not taskDict:
            return S_OK(taskDict)
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found', transID=transID, method=method)
            return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external', transID=transID, method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                      transID=transID, method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one

        :param dict taskDict: dictionary of per task parameters; each task needs a
                              'TaskObject' entry, tasks with an empty one are counted as failed

        :return: S_OK with updated taskDict ('ExternalID' and 'Success' set per task)
        """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        # Only used for logging; set from the tasks while looping
        transID = None
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS", res['Message'],
                               transID=transID, method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                          transID=transID, method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID, method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.

        :param job: either a string from which a jobClass object is built,
                    or an instance of jobClass

        :return: result of the submission client submitJob() call, or S_ERROR when
                 no job object could be obtained
        """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())  # pylint: disable=protected-access
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)  # pylint: disable=protected-access
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        """ Look up in the WMS the jobs matching the names of the given tasks

        :param list taskDicts: task dictionaries, each with 'TransformationID' and 'TaskID'

        :return: S_OK with a dictionary {'NoTasks': names without a WMS job,
                 'TaskNameIDs': {job name: WMS job ID}}, or the error of the job look-up
        """
        method = 'updateTransformationReservedTasks'
        # Only used for logging
        transID = taskDicts[0]['TransformationID'] if taskDicts else None
        jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID'])
                    for taskDict in taskDicts]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS", res['Message'],
                           transID=transID, method=method)
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                # Jobs whose summary cannot be obtained end up in 'NoTasks'
                self._logWarn("Failed to get task summary from WMS", res['Message'],
                              transID=transID, method=method)
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """ Check the status of a list of tasks and return lists of taskIDs for each new status

        :param list taskDicts: task dictionaries with 'TaskID', 'ExternalID',
                               'ExternalStatus' and 'TransformationID'

        :return: S_OK with {new status: [task IDs]}; only tasks whose status changed are listed
        """
        method = 'getSubmittedTaskStatus'

        if not taskDicts:
            return S_OK({})
        # Tasks with an ExternalID of 0 were not submitted and are ignored
        wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
        transID = taskDicts[0]['TransformationID']

        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            # A job unknown to the WMS is considered removed
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                                     (transID, taskID, oldStatus),
                                     transID=transID, method=method)
                    newStatus = "Failed"
                self._logVerbose('Setting job status for Production/Job %d/%d to %s' %
                                 (transID, taskID, newStatus),
                                 transID=transID, method=method)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """ Check the status of a list of files and return the new status of each LFN

        :param list fileDicts: file dictionaries with 'TransformationID', 'TaskID',
                               'LFN' and 'Status'

        :return: S_OK with {LFN: new status}; only files whose status changes are listed
        """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID, method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        # Files of tasks without a WMS job go back to 'Unused'
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID, method=method)
            return res
        statusDict = res['Value']
        # Job statuses that trigger a file status change; any other status is ignored
        fileStatusForJobStatus = {'Done': 'Processed',
                                  'Completed': 'Processed',
                                  'Failed': 'Unused'}
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = fileStatusForJobStatus.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """ Submit several jobs (different sites, types, inputs) and query the monitoring service

    One job is submitted per (destination, input data, job type) combination, then the
    JobMonitoring getters are called and checked, the status and a parameter of the last
    submitted job are updated through JobStateUpdate, and finally all jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))  # pylint: disable=protected-access
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
                jobIDs.append(jobID)
    nSubmitted = len(dests) * len(lfnss) * len(types)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))

    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types))

    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']), [['Received'], sorted(['Received', 'Waiting'])])

    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertIn(sorted(res['Value']), [['Job accepted'], sorted(['Job accepted', 'matching'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(str(x) for x in jobIDs) <= set(res['Value']))

    # NOTE: jobMonitor.getCounters(attrList) is not exercised here

    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting') >= nSubmitted)
    except TypeError:
        # Deliberately best-effort: one of the two counters is missing (None),
        # so the sum cannot be evaluated and the check is skipped.
        pass

    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # jobID is the ID of the last submitted job
    res = jobStateUpdate.setJobStatusBulk(jobID,
                                          {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                             'MinorStatus': 'MinorStatus',
                                                                             'ApplicationStatus': 'ApplicationStatus',
                                                                             'Source': 'Unknown'}})
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
class WorkflowTasks(TaskBase): """ Handles jobs """ def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None): """ Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger('WorkflowTasks') super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE') else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='', ownerDN='', bulkSubmissionFlag=False): """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param transBody: transformation job template :param taskDict: dictionary of per task parameters :param owner: owner of the transformation :param ownerGroup: group of the owner of the transformation :param ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res['OK']: return res proxyInfo = res['Value'] owner = proxyInfo['username'] ownerGroup = proxyInfo['group'] if not ownerDN: res = getDNForUsername(owner) if not res['OK']: return res ownerDN = res['Value'][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a single job object for bulk submission """ if taskDict: transID = taskDict.values()[0]['TransformationID'] else: return S_OK({}) # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) method = 'prepareTransformationTasksBulk' self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) jobType = oJob.workflow.findParameter('JobType').getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter is added to the workflow if not oJob.workflow.findParameter('JOB_ID'): oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") if oJob.workflow.findParameter('PRODUCTION_ID'): oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") oJob.setType(jobType) self._logVerbose('Adding default transformation group of %s' % (transGroup), 
transID=transID, method=method) oJob.setJobGroup(transGroup) if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]: self._handleHospital(oJob) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose('Setting Site: ', str(sites), transID=transID) seqDict['Site'] = sites seqDict['JobName'] = self._transTaskName(transID, taskID) seqDict['JOB_ID'] = str(taskID).zfill(8) self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # Handle Input Data inputData = paramsDict.get('InputData') if inputData: self._logVerbose('Setting input data to %s' % inputData, transID=transID, method=method) seqDict['InputData'] = inputData elif paramSeqDict.get('InputData') is not None: self._logError("Invalid mixture of jobs with and without input data") return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logVerbose('Setting %s to %s' % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, # pylint: disable=protected-access 'TaskID': taskID, 'InputData': inputData}) if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): seqDict[name] = output 
outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter(oJob.workflow, # pylint: disable=protected-access name, 'JDL', "%%(%s)s" % name, name) for pName, seq in seqDict.iteritems(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.iteritems(): if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList: oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: oJob.setParameterSequence(paramName, paramSeq) taskDict['BulkJobObject'] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """ Prepare transformation tasks with a job object per task """ method = '__prepareTasks' startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) site = oJobTemplate.workflow.findParameter('Site').getValue() jobType = oJobTemplate.workflow.findParameter('JobType').getValue() templateOK = False getOutputDataTiming = 0. 
for taskID, paramsDict in taskDict.iteritems(): # Create a job for each task and add it to the taskDict if not templateOK: templateOK = True # Update the template with common information transID = paramsDict['TransformationID'] self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup), transID=transID, method=method) transGroup = str(transID).zfill(8) self._logVerbose('Adding default transformation group of %s' % (transGroup), transID=transID, method=method) oJobTemplate.setJobGroup(transGroup) if oJobTemplate.workflow.findParameter('PRODUCTION_ID'): oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8)) else: oJobTemplate._addParameter(oJobTemplate.workflow, 'PRODUCTION_ID', 'string', str(transID).zfill(8), "Production ID") if not oJobTemplate.workflow.findParameter('JOB_ID'): oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID") paramsDict['Site'] = site paramsDict['JobType'] = jobType # Now create the job from the template oJob = copy.deepcopy(oJobTemplate) constructedName = self._transTaskName(transID, taskID) self._logVerbose('Setting task name to %s' % constructedName, transID=transID, method=method) oJob.setName(constructedName) oJob._setParamValue('JOB_ID', str(taskID).zfill(8)) inputData = None self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)), transID=transID, method=method) # These helper functions do the real job sites = self._handleDestination(paramsDict) if not sites: self._logError('Could not get a list a sites', transID=transID, method=method) paramsDict['TaskObject'] = '' continue else: self._logDebug('Setting Site: ', str(sites), transID=transID, method=method) res = oJob.setDestination(sites) if not res['OK']: self._logError('Could not set the site: %s' % res['Message'], transID=transID, method=method) continue self._handleInputs(oJob, paramsDict) self._handleRest(oJob, paramsDict) hospitalTrans = [int(x) for x in 
self.opsH.getValue("Hospital/Transformations", [])] if int(transID) in hospitalTrans: self._handleHospital(oJob) paramsDict['TaskObject'] = '' if self.outputDataModule: getOutputDataTiming -= time.time() res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID, 'TaskID': taskID, 'InputData': inputData}) getOutputDataTiming += time.time() if not res['OK']: self._logError("Failed to generate output data", res['Message'], transID=transID, method=method) continue for name, output in res['Value'].iteritems(): oJob._addJDLParameter(name, ';'.join(output)) paramsDict['TaskObject'] = oJob if taskDict: self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)), transID=transID, method=method) self._logInfo('Prepared %d tasks' % len(taskDict), transID=transID, method=method, reftime=startTime) return S_OK(taskDict) ############################################################################# def _handleDestination(self, paramsDict): """ Handle Sites and TargetSE in the parameters """ try: sites = ['ANY'] if paramsDict['Site']: # 'Site' comes from the XML and therefore is ; separated sites = fromChar(paramsDict['Site'], sepChar=';') except KeyError: pass if self.destinationPlugin_o: destinationPlugin_o = self.destinationPlugin_o else: res = self.__generatePluginObject(self.destinationPlugin) if not res['OK']: self._logFatal("Could not generate a destination plugin object") return res destinationPlugin_o = res['Value'] self.destinationPlugin_o = destinationPlugin_o destinationPlugin_o.setParameters(paramsDict) destSites = destinationPlugin_o.run() if not destSites: return sites # Now we need to make the AND with the sites, if defined if sites != ['ANY']: # Need to get the AND destSites &= set(sites) return list(destSites) def _handleInputs(self, oJob, paramsDict): """ set job inputs (+ metadata) """ inputData = paramsDict.get('InputData') transID = paramsDict['TransformationID'] if inputData: 
self._logVerbose('Setting input data to %s' % inputData, transID=transID, method='handleInputs') oJob.setInputData(inputData) def _handleRest(self, oJob, paramsDict): """ add as JDL parameters all the other parameters that are not for inputs or destination """ transID = paramsDict['TransformationID'] for paramName, paramValue in paramsDict.iteritems(): if paramName not in ('InputData', 'Site', 'TargetSE'): if paramValue: self._logDebug('Setting %s to %s' % (paramName, paramValue), transID=transID, method='handleRest') oJob._addJDLParameter(paramName, paramValue) def _handleHospital(self, oJob): """ Optional handle of hospital jobs """ oJob.setType('Hospital') oJob.setInputDataPolicy('download', dataScheduling=False) hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch') oJob.setDestination(hospitalSite) hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) if hospitalCEs: oJob._addJDLParameter('GridCE', hospitalCEs) def __generatePluginObject(self, plugin): """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name """ try: plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin']) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e)) return S_ERROR() try: plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e)) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """ Get the list of job output LFNs from the provided plugin """ if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance['OK']: return moduleInstance self.outputDataModule_o = moduleInstance['Value'] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """ Submit the tasks """ if 'BulkJobObject' in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """ Submit jobs in one go with one parametric job """ if not taskDict: return S_OK(taskDict) startTime = time.time() oJob = taskDict.pop('BulkJobObject') # we can only do this, once the job has been popped, or we _might_ crash transID = taskDict.values()[0]['TransformationID'] if oJob is None: self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk') return S_ERROR(ETSUKN, 'No bulk job object provided for submission') result = self.submitTaskToExternal(oJob) if not result['OK']: return result jobIDList = result['Value'] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task['Success'] = False return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks') # Get back correspondance with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]['ExternalID'] = jobID taskDict[taskID]['Success'] = True submitted = len(jobIDList) self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method='submitTransformationTasksBulk') return 
S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """ Submit jobs one by one """ method = 'submitTransformationTasks' submitted = 0 failed = 0 startTime = time.time() for task in taskDict.itervalues(): transID = task['TransformationID'] if not task['TaskObject']: task['Success'] = False failed += 1 continue res = self.submitTaskToExternal(task['TaskObject']) if res['OK']: task['ExternalID'] = res['Value'] task['Success'] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res['Message'], transID=transID, method=method) task['Success'] = False failed += 1 if submitted: self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime), transID=transID, method=method) if failed: self._logError('Failed to submit %d tasks to WMS.' % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """ Submits a single job to the WMS. """ if isinstance(job, basestring): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", '', x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO.StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts] res = self.jobMonitoringClient.getJobs({'JobName': jobNames}) if not res['OK']: self._logError("Failed to get task from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') return res jobNameIDs = {} for wmsID in res['Value']: res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID)) 
if not res['OK']: self._logWarn("Failed to get task summary from WMS", res['Message'], transID=transID, method='updateTransformationReservedTasks') else: jobNameIDs[res['Value']['JobName']] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ if taskDicts: wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])] transID = taskDicts[0]['TransformationID'] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] updateDict = {} for taskDict in taskDicts: taskID = taskDict['TaskID'] wmsID = int(taskDict['ExternalID']) if not wmsID: continue oldStatus = taskDict['ExternalStatus'] newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed') if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' % (transID, taskID, oldStatus), transID=transID) newStatus = "Failed" self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus), transID=transID) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) # All files are from the same transformation transID = fileDicts[0]['TransformationID'] taskFiles = {} for fileDict in fileDicts: jobName = self._transTaskName(transID, fileDict['TaskID']) taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status'] res = self.updateTransformationReservedTasks(fileDicts) if not res['OK']: self._logWarn("Failed to obtain taskIDs for files", 
transID=transID) return res noTasks = res['Value']['NoTasks'] taskNameIDs = res['Value']['TaskNameIDs'] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].iteritems(): if oldStatus != 'Unused': updateDict[lfn] = 'Unused' res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values()) if not res['OK']: self._logWarn("Failed to get job status from the WMS system", transID=transID) return res statusDict = res['Value'] for jobName, wmsID in taskNameIDs.iteritems(): jobStatus = statusDict.get(wmsID, {}).get('Status') newFileStatus = {'Done': 'Processed', 'Completed': 'Processed', 'Failed': 'Unused'}.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].iteritems(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
  """ Submit several jobs (different sites, types, inputs), then query the job monitoring
      service and update the job state through the JobStateUpdate service.
  """
  wmsClient = WMSClient()
  jobMonitor = JobMonitoringClient()
  jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

  jobIDs = []
  dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
  lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
  types = ['User', 'Test']
  # One job per (destination, input data, type) combination
  for dest in dests:
    for lfns in lfnss:
      for jobType in types:
        job = helloWorldJob()
        job.setDestination(dest)
        job.setInputData(lfns)
        job.setType(jobType)
        jobDescription = createFile(job)
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        jobID = res['Value']
        jobIDs.append(jobID)

  res = jobMonitor.getSites()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.org']),
                  msg="Got %s" % res['Value'])
  res = jobMonitor.getJobTypes()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertEqual(sorted(res['Value']), sorted(types))
  res = jobMonitor.getApplicationStates()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

  res = jobMonitor.getOwners()
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getOwnerGroup()
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getProductionIds()
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getJobGroups()
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getStates()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])],
                  msg="Got %s" % res['Value'])
  res = jobMonitor.getMinorStates()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])],
                  msg="Got %s" % res['Value'])
  res = jobMonitor.getJobs()
  self.assertTrue(res['OK'], res.get('Message'))
  self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
  # res = jobMonitor.getCounters(attrList)
  # self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getCurrentJobCounters()
  self.assertTrue(res['OK'], res.get('Message'))
  try:
    self.assertTrue(res['Value'].get('Received') + res['Value'].get('Waiting') >=
                    len(dests) * len(lfnss) * len(types))
  except TypeError:
    # One of the two counters is missing (get() returned None): nothing to compare
    pass
  res = jobMonitor.getJobsSummary(jobIDs)
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
  self.assertTrue(res['OK'], res.get('Message'))

  # jobID is the last job submitted in the loop above
  res = jobStateUpdate.setJobStatusBulk(jobID,
                                        {str(datetime.datetime.utcnow()): {'Status': 'Running',
                                                                           'MinorStatus': 'MinorStatus',
                                                                           'ApplicationStatus': 'ApplicationStatus',
                                                                           'Source': 'Unknown'}})
  self.assertTrue(res['OK'], res.get('Message'))
  res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
  self.assertTrue(res['OK'], res.get('Message'))

  # delete the jobs - this will just set its status to "deleted"
  wmsClient.deleteJob(jobIDs)
class MonitorAgents(AgentModule):
  """MonitorAgents class.

  Checks the agents, executors and services reported by the system administrator client of
  this host, restarts the ones that look stuck (when enabled), keeps the components in the
  state configured for the host, checks the service URLs in the configuration and sends
  an email summary of what was done.
  """

  def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    # messages collected by logError, sent by email and reset in sendNotification
    self.errors = list()
    # instance name -> {'Treatment': ..., 'LogAge': ...}, reported in the email
    self.accounting = defaultdict(dict)

    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()

  def logError(self, errStr, varMsg=''):
    """Append errors to a list, which is sent in email notification."""
    self.log.error(errStr, varMsg)
    self.errors.append(errStr + " " + varMsg)

  def beginExecution(self):
    """Reload the configurations before every cycle."""
    self.setup = self.am_getOption("Setup", self.setup)
    self.enabled = self.am_getOption("EnableFlag", self.enabled)
    self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
    self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
    self.restartServices = self.am_getOption("RestartServices", self.restartServices)
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)
    self.addressTo = self.am_getOption('MailTo', self.addressTo)
    self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
    self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents)
    self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs)

    self.csAPI = CSAPI()

    res = self.getRunningInstances(instanceType='Agents')
    if not res["OK"]:
      return S_ERROR("Failure to get running agents")
    self.agents = res["Value"]

    res = self.getRunningInstances(instanceType='Executors')
    if not res["OK"]:
      return S_ERROR("Failure to get running executors")
    self.executors = res["Value"]

    res = self.getRunningInstances(instanceType='Services')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle.

    Nothing is sent when there are neither errors nor accounting entries. Otherwise the
    collected errors and the accounting are cleared once the mails have been handed over.
    """
    if not(self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    rows = []
    for instanceName, val in self.accounting.iteritems():
      rows.append([[instanceName],
                   [val.get('Treatment', 'No Treatment')],
                   [str(val.get('LogAge', 'Not Relevant'))]])

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      emailBody += "\n\nErrors:"
      emailBody += "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        self.log.error("Failure to send Email notification to ", address)
        continue

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of running agents, executors or services.

    Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: Dictionary of running instances
    """
    res = self.sysAdminClient.getOverallStatus()
    if not res["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
      return res

    val = res['Value'][instanceType]
    runningAgents = defaultdict(dict)
    for system, agents in val.iteritems():
      for agentName, agentInfo in agents.iteritems():
        if agentInfo['Setup'] and agentInfo['Installed']:
          if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus:
            continue
          confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName)
          for option, default in (('PollingTime', HOUR), ('Port', None)):
            optPath = os.path.join(confPath, option)
            runningAgents[agentName][option] = gConfig.getValue(optPath, default)
          runningAgents[agentName]["LogFileLocation"] = \
              os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current')
          runningAgents[agentName]["PID"] = agentInfo["PID"]
          runningAgents[agentName]['Module'] = agentInfo['Module']
          runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus']
          runningAgents[agentName]['System'] = system

    return S_OK(runningAgents)

  def on_terminate(self, agentName, process):
    """Execute callback when a process terminates gracefully."""
    self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid))

  def execute(self):
    """Execute checks for agents, executors, services.

    :returns: S_ERROR when errors were collected during the cycle, S_OK otherwise
    """
    for instanceType in ('executor', 'agent', 'service'):
      for name, options in getattr(self, instanceType + 's').iteritems():
        # call checkAgent, checkExecutor, checkService
        res = getattr(self, 'check' + instanceType.capitalize())(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      # A missing Running/Stopped section for this host is not reported as an error
      if "Stopped does not exist" not in res['Message'] and \
         "Running does not exist" not in res['Message']:
        self.logError("Failure to control components", res['Message'])

    if not self.errors:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])
    else:
      self.logError('Something was wrong before, not checking URLs this time')

    # sendNotification empties self.errors after reporting them: remember beforehand
    # whether the cycle had errors, otherwise the S_ERROR below can never be returned
    cycleHadErrors = bool(self.errors)
    self.sendNotification()

    if cycleHadErrors:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the age of log file.

    :param str logFileLocation: path to the log file
    :returns: S_OK with the time since the last modification of the file, as a timedelta,
              S_ERROR if the file cannot be accessed
    """
    try:
      lastAccessTime = os.path.getmtime(logFileLocation)
      lastAccessTime = datetime.fromtimestamp(lastAccessTime)
    except OSError as e:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e))

    now = datetime.now()
    age = now - lastAccessTime
    return S_OK(age)

  def restartInstance(self, pid, instanceName, enabled):
    """Kill a process which is then restarted automatically.

    :param int pid: process ID of the instance
    :param str instanceName: name of the instance, used for logging and accounting
    :param bool enabled: restart flag for this type of instance; the process is only
                         terminated if this flag and the agent's EnableFlag are both set
    :returns: S_OK(NO_RESTART) if restarting is disabled, S_OK() once the processes were
              terminated, S_ERROR if terminating them failed
    """
    if not (self.enabled and enabled):
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      agentProc = psutil.Process(int(pid))
      processesToTerminate = agentProc.children(recursive=True)
      processesToTerminate.append(agentProc)

      for proc in processesToTerminate:
        proc.terminate()

      _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5,
                                       callback=partial(self.on_terminate, instanceName))
      # whatever did not stop within the timeout is killed
      for proc in alive:
        self.log.info("Forcefully killing process %s" % proc.pid)
        proc.kill()

      return S_OK()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond."""
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if not pingRes['OK']:
      self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
      res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
      if not res["OK"]:
        return res
      elif res['OK'] and res['Value'] != NO_RESTART:
        self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
        self.log.info("Service %s has been successfully restarted" % serviceName)
    else:
      # only claim success when the ping actually worked
      self.log.info("Service responded OK")
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent."""
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    # timedelta.seconds only holds the part below one day and would make a log file
    # older than 24 hours look fresh: use the complete duration
    ageSeconds = int(age.total_seconds())
    self.log.info("Current log file for %s is %d minutes old" % (agentName, (ageSeconds / MINUTES)))

    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if ageSeconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = ageSeconds / MINUTES

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    # see checkAgent: timedelta.seconds does not include the days
    ageSeconds = int(age.total_seconds())
    self.log.info("Current log file for %s is %d minutes old" % (executor, (ageSeconds / MINUTES)))

    if ageSeconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = ageSeconds / MINUTES

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['OK'] and res['Value'] == NO_CHECKING_JOBS:
      # no job is waiting for this executor: an old log file is not a problem
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Check if there are checking jobs with the **executorName** as current MinorStatus.

    :returns: S_OK(CHECKING_JOBS) or S_OK(NO_CHECKING_JOBS), S_ERROR if the jobs could not be obtained
    """
    attrDict = {'Status': 'Checking', 'MinorStatus': executorName}

    # returns list of jobs IDs
    resJobs = self.jobMonClient.getJobs(attrDict)
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs
    if resJobs['Value']:
      self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName))
      return S_OK(CHECKING_JOBS)
    self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
    return S_OK(NO_CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Check for running and stopped components and ensure they have the proper status as defined in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    # get the current status of the components

    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent
    currentStatus = resCurrent['Value']

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault
    defaultStatus = resDefault['Value']

    # ensure instances are in the right state
    shouldBe = {}
    shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down'])
    shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run'])
    # instances known only to the configuration or only to the host
    shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All'])

    self._ensureComponentRunning(shouldBe['Run'])
    self._ensureComponentDown(shouldBe['Down'])

    for instance in shouldBe['Unknown']:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Get current status for components.

    :returns: S_OK with a dict of sets ('Run', 'Down', 'All') of 'System__Instance' identifiers
    """
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall
    currentStatus = {'Down': set(), 'Run': set(), 'All': set()}
    informationDict = resOverall['Value']
    for systemsDict in informationDict.values():
      for system, instancesDict in systemsDict.items():
        for instanceName, instanceInfoDict in instancesDict.items():
          identifier = '%s__%s' % (system, instanceName)
          runitStatus = instanceInfoDict.get('RunitStatus')
          if runitStatus in ('Run', 'Down'):
            currentStatus[runitStatus].add(identifier)

    currentStatus['All'] = currentStatus['Run'] | currentStatus['Down']
    return S_OK(currentStatus)

  def _getDefaultComponentStatus(self):
    """Get the configured status of the components.

    :returns: S_OK with a dict of sets ('Run', 'Down', 'All'), S_ERROR if the sections cannot
              be read or if an instance is configured both as running and as stopped
    """
    host = socket.gethostname()
    defaultStatus = {'Down': set(), 'Run': set(), 'All': set()}
    resRunning = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running'))
    resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped'))
    if not resRunning['OK']:
      return resRunning
    if not resStopped['OK']:
      return resStopped
    defaultStatus['Run'] = set(resRunning['Value'].keys())
    defaultStatus['Down'] = set(resStopped['Value'].keys())
    defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down']

    if defaultStatus['Run'].intersection(defaultStatus['Down']):
      self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down'])))
      return S_ERROR("Bad host configuration")

    return S_OK(defaultStatus)

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running.

    The components are only started when ControlComponents is enabled, otherwise the
    required action is just added to the accounting.
    """
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running.

    The components are only stopped when ControlComponents is enabled, otherwise the
    required action is just added to the accounting.
    """
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Ensure that the running services have their URL in the Config."""
    self.log.info("Checking URLs")
    # get services again, in case they were started/stop in controlComponents
    gConfig.forceRefresh(fromMaster=True)
    res = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]
    for service, options in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (service, options))
      # ignore SystemAdministrator, does not have URLs
      if 'SystemAdministrator' in service:
        continue
      self._checkServiceURL(service, options)

    if self.csAPI.csModified and self.commitURLs:
      self.log.info("Commiting changes to the CS")
      result = self.csAPI.commit()
      if not result['OK']:
        self.logError('Commit to CS failed', result['Message'])
        return S_ERROR("Failed to commit to CS")
    return S_OK()

  def _checkServiceURL(self, serviceName, options):
    """Ensure service URL is properly configured in the CS.

    The URL of a service in 'Run' status is added to the list of URLs when missing, the URL
    of a service in 'Down' status is removed from it when present.
    """
    url = self._getURL(serviceName, options)
    system = options['System']
    module = options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))
    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)
    runitStatus = options['RunitStatus']
    # changes are only committed (in checkURLs) if CommitURLs is enabled
    wouldHave = 'Would have ' if not self.commitURLs else ''
    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
    if runitStatus == 'Down' and url in urls:
      urls.remove(url)
      message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service.

    The URL is built from the name of this host and the 'Port' and 'System' entries of options.
    """
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
# Select the failed jobs that reached the maximum number of reschedulings with a
# 'Failed Input Data Resolution ' application status ("IDR" in the messages below
# presumably stands for Input Data Resolution), optionally restricted to given
# productions and/or to one user.
# NOTE(review): production, userName, verbose and monitoring are defined outside of this
# fragment, and its original indentation is not available - the nesting shown here
# (in particular of the 'JobGroup' assignment) has to be confirmed against the original.
conditions = {
    'Status': 'Failed',
    'MinorStatus': 'Maximum of reschedulings reached',
    'ApplicationStatus': 'Failed Input Data Resolution '
}
prStr = 'all jobs'
if production:
  prStr = 'production %s' % ' '.join(production)
  # a single production is passed as a plain value rather than as a list
  if len(production) == 1:
    production = production[0]
  conditions['JobGroup'] = production
if userName:
  # prStr only mentions the user, the production selection (if any) still applies
  prStr = 'user %s' % userName
  conditions['Owner'] = userName
gLogger.always('Obtaining IDR jobs for %s' % prStr)
res = monitoring.getJobs(conditions)
if not res['OK']:
  gLogger.always(
      'Error selecting jobs for production %s' % str(production), res['Message'])
  DIRAC.exit(2)
if not res['Value']:
  gLogger.always("No jobs found with IDR for production %s" % str(production))
elif verbose:
  gLogger.always('Selected %d jobs from production %s' % (len(res['Value']), str(production)))
# job IDs are converted to integers for the analysis that follows
jobs = [int(job) for job in res['Value']]
gLogger.always("Obtained %d jobs... Now analyzing them" % len(jobs))
if not jobs:
  gLogger.always('No jobs to check, exiting...')
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit several jobs, then exercise the monitoring and state-update clients.

    One job is submitted for every combination of input-data list and job
    type, all with destination ``DIRAC.Jenkins.ch``.  The distinct-value
    queries of the job monitoring client are then checked, a bulk status
    update and a parameter update are applied to the last submitted job, and
    finally all submitted jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'],
             ['/a/1.txt', '/a/3.txt', '/a/4.txt'],
             []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'},
                    msg="Got %s" % res['Value'])

    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types),
                     msg="Got %s" % str(sorted(res['Value'])))

    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    # The message shows the sorted list of states (not the sorted characters
    # of its string form).
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']),
                     msg="Got %s" % str(sorted(res['Value'])))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))

    # Job groups: no cut-off and a cut-off of "now" must agree; a cut-off one
    # year back must give a subset of those.
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(
        None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(
        set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(
        sorted(res['Value']) in [['Received'],
                                 sorted(['Received', 'Waiting'])])

    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(
        sorted(res['Value']) in [['Job accepted'],
                                 sorted(['Job accepted', 'Job Rescheduled'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))

    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))

    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(
            res['Value'].get('Received') + res['Value'].get('Waiting')
            >= int(len(lfnss) * len(types)))
    except TypeError:
        # One of the two counters is absent (``get`` returned None), so the
        # sum cannot be formed; the count check is skipped in that case.
        pass

    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # State updates are applied to the last submitted job only.
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
class TransformationCleaningAgent(AgentModule):
    """
    .. class:: TransformationCleaningAgent

    Agent that cleans, removes the output of, and archives transformations,
    depending on their status.

    :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
    :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
    :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance
    """

    def __init__(self, *args, **kwargs):
        """c'tor: declare every attribute; clients are created in :meth:`initialize`."""
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None
        # # job monitoring client
        self.jobMonitoringClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = ["TransformationDB", "MetadataCatalog"]
        # # transformation metadata
        self.transfidmeta = "TransformationID"
        # # archive period in days
        self.archiveAfter = 7
        # # transformation log SEs
        self.logSE = "LogSE"
        # # enable/disable execution
        self.enableFlag = "True"

        self.dataProcTTypes = ["MCSimulation", "Merge"]
        self.dataManipTTypes = ["Replication", "Removal"]

    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        :return: S_OK()
        """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive period in days
        self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" % self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()
        # # job monitoring client
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()

    #############################################################################
    def _executeWithProxy(self, method, transDict, action):
        """Run ``method(transDict)`` directly or wrapped with the author's credentials.

        When ``self.shifterProxy`` is set the method is called as is; otherwise
        it is wrapped with ``executeWithUserProxy`` and given the ``AuthorDN``
        and ``AuthorGroup`` found in ``transDict``.

        :param callable method: one of the ``_execute*`` methods
        :param dict transDict: transformation description
        :param str action: leading words of the info message logged before the wrapped call
        """
        if self.shifterProxy:
            method(transDict)
            return
        self.log.info(
            (action + " %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s") % transDict)
        executeWithUserProxy(method)(
            transDict,
            proxyUserDN=transDict["AuthorDN"],
            proxyUserGroup=transDict["AuthorGroup"])

    def execute(self):
        """execution in one agent's cycle

        Handles, in turn, transformations in ``Cleaning`` status, in
        ``RemovingFiles`` status, and ``Completed`` ones whose ``LastUpdate``
        is older than ``archiveAfter`` days.  A failed query is logged and the
        next step is still attempted.

        :param self: self reference
        :return: S_OK()
        """
        self.enableFlag = self.am_getOption("EnableFlag", self.enableFlag)
        if self.enableFlag != "True":
            self.log.info(
                "TransformationCleaningAgent is disabled by configuration option EnableFlag"
            )
            return S_OK("Disabled via CS flag")

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            "Status": "Cleaning",
            "Type": self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                self._executeWithProxy(self._executeClean, transDict,
                                       "Cleaning transformation")
        else:
            self.log.error("Failed to get transformations", res["Message"])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({
            "Status": "RemovingFiles",
            "Type": self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                self._executeWithProxy(self._executeRemoval, transDict,
                                       "Removing files for transformation")
        else:
            self.log.error("Could not get the transformations", res["Message"])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                "Status": "Completed",
                "Type": self.transformationTypes
            },
            older=olderThanTime,
            timeStamp="LastUpdate")
        if res["OK"]:
            for transDict in res["Value"]:
                self._executeWithProxy(self._executeArchive, transDict,
                                       "Archiving files for transformation")
        else:
            self.log.error("Could not get the transformations", res["Message"])
        return S_OK()

    def finalize(self):
        """Only at finalization: will clean ancient transformations (remnants)

        1) get the transformation IDs of jobs that are older than 1 year
        2) find the status of those transformations. Those "Cleaned" and "Archived" will be
           cleaned and archived (again)

        Why doing this here? Basically, it's a race:

        1) the production manager submits a transformation
        2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation
           in their internal queue, so eventually during their (long-ish) cycle they'll work on it.
        3) 1 minute after creating the transformation, the production manager cleans it (by hand,
           for whatever reason). So, the status is changed to "Cleaning"
        4) the TransformationCleaningAgent cleans what has been created (maybe, nothing),
           then sets the transformation status to "Cleaned" or "Archived"
        5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in,
           creating tasks and jobs for a production that's effectively cleaned (but these 2 agents
           don't know yet).

        Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent,
        but these 2 agents are already doing a lot of stuff, and are pretty heavy.
        So, we should just clean from time to time.
        What I added here is done only when the agent finalize, and it's quite light-ish
        operation anyway.

        :return: S_OK(), or the failing S_ERROR of one of the queries / the WMS removal
        """
        res = self.jobMonitoringClient.getJobGroups(
            None, datetime.utcnow() - timedelta(days=365))
        if not res["OK"]:
            self.log.error("Failed to get job groups", res["Message"])
            return res
        transformationIDs = res["Value"]
        if not transformationIDs:
            return S_OK()

        res = self.transClient.getTransformations(
            {"TransformationID": transformationIDs})
        if not res["OK"]:
            self.log.error("Failed to get transformations", res["Message"])
            return res
        transformations = res["Value"]

        toClean = [td for td in transformations if td["Status"] == "Cleaned"]
        toArchive = [td for td in transformations if td["Status"] == "Archived"]

        for transDict in toClean:
            self._executeWithProxy(self._executeClean, transDict,
                                   "Cleaning transformation")
        for transDict in toArchive:
            self._executeWithProxy(self._executeArchive, transDict,
                                   "Archiving files for transformation")

        # Remove JobIDs that were unknown to the TransformationSystem
        jobGroupsToCheck = [
            str(transDict["TransformationID"]).zfill(8)
            for transDict in toClean + toArchive
        ]
        if not jobGroupsToCheck:
            # NOTE(review): how the monitoring service treats an empty JobGroup
            # filter is not visible here; with nothing to check, the query and
            # the job removal that follows it are skipped.
            return S_OK()
        res = self.jobMonitoringClient.getJobs({"JobGroup": jobGroupsToCheck})
        if not res["OK"]:
            return res
        jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
        res = self.__removeWMSTasks(jobIDsToRemove)
        if not res["OK"]:
            return res

        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation.

        :param dict transDict: transformation description (``Type``, ``TransformationID``)
        """
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict["Type"] in self.dataManipTTypes:
            res = self.archiveTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems archiving transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))
        else:
            res = self.cleanTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems cleaning transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation.

        :param dict transDict: transformation description (``TransformationID``)
        """
        res = self.removeTransformationOutput(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems removing transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeArchive(self, transDict):
        """Archive the given transformation.

        :param dict transDict: transformation description (``TransformationID``)
        :return: S_OK(), also when archiving failed (the failure is logged)
        """
        res = self.archiveTransformation(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems archiving transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """get the directories for the supplied transformation from the transformation system.
        These directories are used by removeTransformationOutput and cleanTransformation
        for removing output.

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK(sorted list of directories) or the failing S_ERROR
        """
        self.log.verbose(
            "Cleaning Transformation directories of transformation %d" % transID)
        directories = []
        if "TransformationDB" in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ["OutputDirectories"])
            if not res["OK"]:
                self.log.error("Failed to obtain transformation directories",
                               res["Message"])
                return res
            transDirectories = []
            if res["Value"]:
                if not isinstance(res["Value"], list):
                    try:
                        transDirectories = ast.literal_eval(res["Value"])
                    except Exception:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res["Value"])
                else:
                    transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if "MetadataCatalog" in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res["OK"]:
                self.log.error("Failed to obtain metadata catalog directories",
                               res["Message"])
                return res
            transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """append unique :newDirs: list to :existingDirs: list

        Only paths containing the transformation ID, zero-padded to 8 digits,
        are considered.  Paths are normalised *before* the uniqueness check, so
        that e.g. ``/a/b`` and ``/a/b/`` are not both added.

        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths (modified in place)
        :return: existingDirs
        """
        transStr = str(transID).zfill(8)
        for folder in newDirs:
            if not re.search(transStr, str(folder)):
                continue
            normalized = os.path.normpath(folder)
            if normalized not in existingDirs:
                existingDirs.append(normalized)
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def _removeFiles(self, lfns):
        """Remove files through the DataManager with ``UseServerCertificate`` switched off.

        The option is set back to ``"true"`` in a ``finally`` clause, so it is
        restored even if the removal raises.

        :param list lfns: files to remove
        :return: result of ``DataManager().removeFile(lfns, force=True)``
        """
        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        try:
            return DataManager().removeFile(lfns, force=True)
        finally:
            gConfigurationData.setOptionInCFG(
                "/DIRAC/Security/UseServerCertificate", "true")

    def cleanContent(self, directory):
        """wipe out everything from catalog under folder :directory:

        :param self: self reference
        :params str directory: folder name
        :return: S_OK(), or S_ERROR if a file could not be removed for a reason
                 other than "File does not exist"
        """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res["OK"]:
            return res
        filesFound = res["Value"]
        if not filesFound:
            self.log.info(
                "No files are registered in the catalog directory %s" % directory)
            return S_OK()
        self.log.info(
            "Attempting to remove possible remnants from the catalog and storage",
            "(n=%d)" % len(filesFound))

        res = self._removeFiles(filesFound)
        if not res["OK"]:
            return res
        realFailure = False
        for lfn, reason in res["Value"]["Failed"].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog",
                               "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """get catalog contents under paths :directories:

        Sub-directories are followed as they are found.  Directories that
        cannot be listed are logged and skipped.

        :param self: self reference
        :param list directories: list of paths in catalog (left unmodified)
        :return: S_OK(list of file names found)
        """
        self.log.info("Obtaining the catalog contents for %d directories:" %
                      len(directories))
        for directory in directories:
            self.log.info(directory)
        # Work on a copy: the queue is consumed and extended while walking.
        activeDirs = list(directories)
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs.pop(0)
            res = returnSingleResult(fc.listDirectory(currentDir))
            if res["OK"]:
                dirContents = res["Value"]
                activeDirs.extend(dirContents["SubDirs"])
                allFiles.update(dirContents["Files"])
            elif "Directory does not exist" in res["Message"]:
                # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" % currentDir)
            elif "No such file or directory" in res["Message"]:
                self.log.info("%s: %s" % (currentDir, res["Message"]))
            else:
                self.log.error(
                    "Failed to get directory %s content" % currentDir,
                    res["Message"])
        self.log.info("", "Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        :return: S_OK() (also when the directory does not exist) or the failing S_ERROR
        """
        self.log.verbose("Removing log files found in the directory", directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory, recursive=True))
        if not res["OK"]:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res["Message"])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """This just removes any mention of the output data from the catalog and storage

        :param int transID: transformation ID
        :return: S_OK() or the failing S_ERROR; a failure to obtain the
                 directories is logged and S_OK() is returned
        """
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]
        for directory in directories:
            # Log directories are left alone here.
            if not re.search("/LOG/", directory):
                res = self.cleanContent(directory)
                if not res["OK"]:
                    return res

        self.log.info(
            "Removed %d directories from the catalog and its files from the storage for transformation %s"
            % (len(directories), transID))
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully removed output of transformation", transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, "Status", "RemovedFiles")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to RemovedFiles" % (transID),
                res["Message"])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """This just removes job from the jobDB and the transformation DB

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK() or the failing S_ERROR
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Archived")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Archived" % (transID),
                res["Message"])
            return res
        self.log.info("Updated status of transformation %s to Archived" % (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.

        :param int transID: transformation ID
        :return: S_OK() or the failing S_ERROR; a failure to obtain the
                 directories is logged and S_OK() is returned
        """
        self.log.info("Cleaning transformation", transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search("/LOG/", directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res["OK"]:
                    return res
            res = self.cleanContent(directory)
            if not res["OK"]:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully cleaned transformation", transID)
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Cleaned")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Cleaned" % (transID),
                res["Message"])
            return res
        self.log.info("Updated status of transformation", "%s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """wipe out files from catalog

        :param int transID: transformation ID, matched against the ``transfidmeta`` metadata tag
        :return: S_OK(), or S_ERROR if the query or any removal failed
        """
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res["OK"]:
            return res
        fileToRemove = res["Value"]
        if not fileToRemove:
            self.log.info("No files found for transID", transID)
            return S_OK()

        res = self._removeFiles(fileToRemove)
        if not res["OK"]:
            return res
        for lfn, reason in res["Value"]["Failed"].items():
            self.log.error("Failed to remove file found in metadata catalog",
                           "%s %s" % (lfn, reason))
        if res["Value"]["Failed"]:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the DFC")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation

        :param int transID: transformation ID
        :return: S_OK() or the failing S_ERROR
        """
        self.log.verbose("Cleaning Transformation tasks of transformation", transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res["OK"]:
            return res
        externalIDs = res["Value"]
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ["Type"])
            if not res["OK"]:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res["Value"]
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res["OK"]:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transformation ID
        :return: S_OK(list of ExternalIDs) or the failing S_ERROR
        """
        res = self.transClient.getTransformationTasks(
            condDict={"TransformationID": transID})
        if not res["OK"]:
            self.log.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res["Message"])
            return res
        externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """This will remove requests from the RMS system -

        Zero IDs are skipped.  A failed cancellation is logged but does not
        change the result.

        :param list requestIDs: request IDs (anything ``int()`` accepts)
        :return: S_OK()
        """
        rIDs = [int(j) for j in requestIDs if int(j)]
        for reqID in rIDs:
            res = self.reqClient.cancelRequest(reqID)
            if not res["OK"]:
                self.log.error("Failed to cancel request",
                               "%s: %s" % (reqID, res["Message"]))

        return S_OK()

    def __removeWMSTasks(self, transJobIDs):
        """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system

        Jobs are killed and deleted in chunks of 500; afterwards the requests
        associated with the jobs are cancelled.

        :param self: self reference
        :param list transJobIDs: job IDs
        :return: S_OK(), or S_ERROR if jobs or requests could not all be removed
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):

            res = self.wmsClient.killJob(jobList)
            if res["OK"]:
                self.log.info("Successfully killed %d jobs from WMS" % len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) \
                    and ("FailedJobIDs" not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to kill jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to kill jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res["OK"]:
                self.log.info("Successfully deleted jobs from WMS",
                              "(n=%d)" % len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) \
                    and ("FailedJobIDs" not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to delete jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to delete jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to delete all remnants from WMS")
        self.log.info("Successfully deleted all tasks from the WMS")

        if not jobIDs:
            self.log.info(
                "JobIDs not present, unable to delete associated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res["OK"]:
            self.log.error("Failed to get requestID for jobs.", res["Message"])
            return res
        failoverRequests.update(res["Value"]["Successful"])
        if not failoverRequests:
            return S_OK()
        for jobID, requestID in failoverRequests.items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == "0":
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB",
                               res["Message"])
                failed += 1
            else:
                # %s rather than %d: the check above allows for string job IDs.
                self.log.verbose("Removed request %s associated to job %s." %
                                 (requestID, jobID))

        if failed:
            self.log.info("Successfully removed requests",
                          "(n=%d)" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove requests", "(n=%d)" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
class ComponentSupervisionAgent(AgentModule):
    """Supervise the DIRAC components installed on the local host.

    Each cycle the agent

    * checks executors and agents through the age of their runit log file and
      services through a ping, restarting unresponsive instances when allowed,
    * compares the runit status of the components with the status configured
      under ``/Registry/Hosts/<host>/[Running|Stopped]`` and starts/stops them
      when ``ControlComponents`` is enabled,
    * checks that running services have their URL in the configuration,
    * sends an email summarising the treatments and the errors of the cycle.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        # master switch and per component type switches for restarting
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        # start/stop components so that they match the configured status
        self.controlComponents = False
        # commit URL changes to the CS, otherwise they are only reported
        self.commitURLs = False
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        self._tornadoPort = "8443"
        # messages collected by logError, reported in the notification email
        self.errors = list()
        # instance name -> {"Treatment": ..., "LogAge": ...} for the email
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification."""
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the configurations before every cycle.

        Re-reads the agent options, creates a fresh CSAPI and refreshes the
        dictionaries of running agents, executors and services.

        :returns: S_OK, or S_ERROR if the running instances cannot be obtained
        """
        self.setup = self.am_getOption("Setup", self.setup)
        self.enabled = self.am_getOption("EnableFlag", self.enabled)
        self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
        self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
        self.restartServices = self.am_getOption("RestartServices", self.restartServices)
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)
        self.controlComponents = self.am_getOption("ControlComponents", self.controlComponents)
        self.commitURLs = self.am_getOption("CommitURLs", self.commitURLs)
        self.doNotRestartInstancePattern = self.am_getOption(
            "DoNotRestartInstancePattern", self.doNotRestartInstancePattern
        )

        self.csAPI = CSAPI()

        res = self.getRunningInstances(instanceType="Agents")
        if not res["OK"]:
            return S_ERROR("Failure to get running agents")
        self.agents = res["Value"]

        res = self.getRunningInstances(instanceType="Executors")
        if not res["OK"]:
            return S_ERROR("Failure to get running executors")
        self.executors = res["Value"]

        res = self.getRunningInstances(instanceType="Services")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle.

        Nothing is sent when there are neither errors nor accounting entries.
        The collected errors and the accounting are reset once the mails have
        been attempted, also for addresses where sending failed.

        :returns: S_OK
        """
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = []
        for instanceName, val in self.accounting.items():
            rows.append(
                [[instanceName], [val.get("Treatment", "No Treatment")], [str(val.get("LogAge", "Not Relevant"))]]
            )

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=" | ")

        if self.errors:
            # the header gets its own line, otherwise the first error is glued to it
            emailBody += "\n\nErrors:\n"
            emailBody += "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
            if not res["OK"]:
                self.log.error("Failure to send Email notification to ", address)
                continue

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of running agents, executors or services.

        Key is component's name, value contains dict with PollingTime, Port, Protocol (these three only when
        they have a non-empty value), LogFileLocation, PID, Module, RunitStatus and System

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: S_OK with the dictionary of instances, or the S_ERROR of the system administrator client
        """
        res = self.sysAdminClient.getOverallStatus()
        if not res["OK"]:
            self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
            return res

        val = res["Value"][instanceType]
        runningComponents = defaultdict(dict)
        for system, components in val.items():
            for componentName, componentInfo in components.items():
                if componentInfo["Setup"] and componentInfo["Installed"]:
                    if runitStatus != "All" and componentInfo["RunitStatus"] != runitStatus:
                        continue
                    for option, default in (("PollingTime", HOUR), ("Port", None), ("Protocol", None)):
                        runningComponents[componentName][option] = self._getComponentOption(
                            instanceType, system, componentName, option, default
                        )
                        # remove empty values so we can use defaults in _getURL
                        if not runningComponents[componentName][option]:
                            runningComponents[componentName].pop(option)
                    runningComponents[componentName]["LogFileLocation"] = os.path.join(
                        self.diracLocation, "runit", system, componentName, "log", "current"
                    )
                    runningComponents[componentName]["PID"] = componentInfo["PID"]
                    runningComponents[componentName]["Module"] = componentInfo["Module"]
                    runningComponents[componentName]["RunitStatus"] = componentInfo["RunitStatus"]
                    runningComponents[componentName]["System"] = system

        return S_OK(runningComponents)

    def _getComponentOption(self, instanceType, system, componentName, option, default):
        """Get component option from DIRAC CS, using components' base classes methods.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str system: name of the system the component belongs to
        :param str componentName: name of the component
        :param str option: name of the option to look up
        :param default: value to use if the option is not set
        :returns: value of the option
        """
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )
        if instanceType != "Agents":
            return gConfig.getValue(Path.cfgPath(componentPath, option), default)
        # deal with agent configuration
        componentLoadModule = gConfig.getValue(Path.cfgPath(componentPath, "Module"), componentName)
        fullComponentName = Path.cfgPath(system, componentName)
        fullComponentLoadName = Path.cfgPath(system, componentLoadModule)
        return AgentModule(fullComponentName, fullComponentLoadName).am_getOption(option, default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully."""
        self.log.info("%s's process with ID: %s has been terminated successfully" % (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services.

        :returns: S_OK, or S_ERROR if any error was recorded during this cycle
        """
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                res = getattr(self, "check" + instanceType.capitalize())(name, options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            if "Stopped does not exist" not in res["Message"] and "Running does not exist" not in res["Message"]:
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError("Something was wrong before, not checking URLs this time")

        # sendNotification resets self.errors, so remember the outcome of the cycle first
        cycleHadErrors = bool(self.errors)
        self.sendNotification()

        if cycleHadErrors:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of log file.

        The age is measured from the modification time of the file.

        :param str logFileLocation: path to the log file
        :returns: S_OK with the age as :class:`datetime.timedelta`, S_ERROR if the file cannot be accessed
        """
        try:
            lastAccessTime = datetime.fromtimestamp(os.path.getmtime(logFileLocation))
        except OSError as e:
            return S_ERROR("Failed to access logfile %s: %r" % (logFileLocation, e))

        age = datetime.now() - lastAccessTime
        return S_OK(age)

    def restartInstance(self, pid, instanceName, enabled):
        """Kill a process which is then restarted automatically.

        The process and all of its children are terminated; whatever is still alive after five seconds is killed.

        :param int pid: process ID of the instance
        :param str instanceName: name of the instance, used for logging and accounting
        :param bool enabled: whether restarting is enabled for this type of instance
        :returns: S_OK(NO_RESTART) if restarting is disabled for the instance, S_OK() after the processes
                  were terminated, S_ERROR if psutil failed
        """
        if not (self.enabled and enabled):
            self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        if any(pattern in instanceName for pattern in self.doNotRestartInstancePattern):
            self.log.info("Restarting for %s is disabled, please restart it manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            componentProc = psutil.Process(int(pid))
            processesToTerminate = componentProc.children(recursive=True)
            processesToTerminate.append(componentProc)

            for proc in processesToTerminate:
                proc.terminate()

            _gone, alive = psutil.wait_procs(
                processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)
            )
            for proc in alive:
                self.log.info("Forcefully killing process %s" % proc.pid)
                proc.kill()

            return S_OK()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes", "%s" % err)
            return S_ERROR()

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond.

        :param str serviceName: name of the service
        :param dict options: instance information as returned by :func:`getRunningInstances`
        :returns: S_OK, or the S_ERROR of :func:`restartInstance`
        """
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if pingRes["OK"]:
            self.log.info("Service responded OK")
            return S_OK()

        self.log.info("Failure pinging service: %s: %s" % (url, pingRes["Message"]))
        res = self.restartInstance(int(options["PID"]), serviceName, self.restartServices)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
            self.log.info("Service %s has been successfully restarted" % serviceName)
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of agent's log file, if it is too old then restart the agent.

        The log file is too old if its age reaches the larger of PollingTime + HOUR and 2 * HOUR.

        :param str agentName: name of the agent
        :param dict options: instance information as returned by :func:`getRunningInstances`
        :returns: S_OK, or S_ERROR if the log file cannot be accessed or the restart failed
        """
        pollingTime, currentLogLocation, pid = (options["PollingTime"], options["LogFileLocation"], options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # timedelta.seconds is only the seconds *component* (0..86399) and ignores the days,
        # total_seconds() is the real age
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (agentName, ageSeconds / MINUTES))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if ageSeconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = int(ageSeconds / MINUTES)

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" % agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.

        An executor with an old log file is only restarted if there are jobs waiting for it.

        :param str executor: name of the executor
        :param dict options: instance information as returned by :func:`getRunningInstances`
        :returns: S_OK, S_OK(NO_RESTART) if no jobs are waiting for the executor, or S_ERROR
        """
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # timedelta.seconds is only the seconds *component* (0..86399) and ignores the days,
        # total_seconds() is the real age
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (executor, ageSeconds / MINUTES))

        if ageSeconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = int(ageSeconds / MINUTES)

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["Value"] == NO_CHECKING_JOBS:
            # nothing is waiting for this executor, do not report it
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" % executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Check if there are checking jobs with the **executorName** as current MinorStatus.

        :param str executorName: name of the executor
        :returns: S_OK(CHECKING_JOBS), S_OK(NO_CHECKING_JOBS), or the S_ERROR of the job monitoring client
        """
        attrDict = {"Status": "Checking", "MinorStatus": executorName}

        # returns list of jobs IDs
        resJobs = self.jobMonClient.getJobs(attrDict)
        if not resJobs["OK"]:
            self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs["Message"]))
            return resJobs
        if resJobs["Value"]:
            self.log.info('Found %d jobs in "Checking" status for %s' % (len(resJobs["Value"]), executorName))
            return S_OK(CHECKING_JOBS)
        self.log.info('Found no jobs in "Checking" status for %s' % executorName)
        return S_OK(NO_CHECKING_JOBS)

    def componentControl(self):
        """Monitor and control component status as defined in the CS.

        Check for running and stopped components and ensure they have the proper status as defined in the CS
        Registry/Hosts/_HOST_/[Running|Stopped] sections

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # get the current status of the components
        resCurrent = self._getCurrentComponentStatus()
        if not resCurrent["OK"]:
            return resCurrent
        currentStatus = resCurrent["Value"]

        resDefault = self._getDefaultComponentStatus()
        if not resDefault["OK"]:
            return resDefault
        defaultStatus = resDefault["Value"]

        # ensure instances are in the right state
        shouldBe = {}
        shouldBe["Run"] = defaultStatus["Run"].intersection(currentStatus["Down"])
        shouldBe["Down"] = defaultStatus["Down"].intersection(currentStatus["Run"])
        # instances which are either only configured or only installed
        shouldBe["Unknown"] = defaultStatus["All"].symmetric_difference(currentStatus["All"])

        self._ensureComponentRunning(shouldBe["Run"])
        self._ensureComponentDown(shouldBe["Down"])

        for instance in shouldBe["Unknown"]:
            self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Get current status for components.

        :returns: S_OK with a dict of sets of ``<system>__<instance>`` identifiers under the keys
                  'Run', 'Down' and 'All', or the S_ERROR of the system administrator client
        """
        resOverall = self.sysAdminClient.getOverallStatus()
        if not resOverall["OK"]:
            return resOverall
        currentStatus = {"Down": set(), "Run": set(), "All": set()}
        informationDict = resOverall["Value"]
        for systemsDict in informationDict.values():
            for system, instancesDict in systemsDict.items():
                for instanceName, instanceInfoDict in instancesDict.items():
                    identifier = "%s__%s" % (system, instanceName)
                    runitStatus = instanceInfoDict.get("RunitStatus")
                    if runitStatus in ("Run", "Down"):
                        currentStatus[runitStatus].add(identifier)

        currentStatus["All"] = currentStatus["Run"] | currentStatus["Down"]
        return S_OK(currentStatus)

    def _getDefaultComponentStatus(self):
        """Get the configured status of the components.

        :returns: S_OK with a dict of sets of identifiers under the keys 'Run', 'Down' and 'All',
                  S_ERROR if the configuration cannot be read or an instance is in both sections
        """
        host = socket.getfqdn()
        defaultStatus = {"Down": set(), "Run": set(), "All": set()}
        resRunning = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Running"))
        resStopped = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Stopped"))
        if not resRunning["OK"]:
            return resRunning
        if not resStopped["OK"]:
            return resStopped
        defaultStatus["Run"] = set(resRunning["Value"])
        defaultStatus["Down"] = set(resStopped["Value"])
        defaultStatus["All"] = defaultStatus["Run"] | defaultStatus["Down"]

        overlap = defaultStatus["Run"].intersection(defaultStatus["Down"])
        if overlap:
            self.logError("Overlap in configuration", str(overlap))
            return S_ERROR("Bad host configuration")

        return S_OK(defaultStatus)

    def _ensureComponentRunning(self, shouldBeRunning):
        """Ensure the correct components are running.

        :param shouldBeRunning: ``<system>__<instance>`` identifiers of the instances to start
        """
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            # split only once, the instance name itself may contain '__'
            system, name = instance.split("__", 1)
            if self.controlComponents:
                res = self.sysAdminClient.startComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to start component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was down, started instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is down, should be started"

    def _ensureComponentDown(self, shouldBeDown):
        """Ensure the correct components are not running.

        :param shouldBeDown: ``<system>__<instance>`` identifiers of the instances to stop
        """
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            # split only once, the instance name itself may contain '__'
            system, name = instance.split("__", 1)
            if self.controlComponents:
                res = self.sysAdminClient.stopComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to stop component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

    def checkURLs(self):
        """Ensure that the running services have their URL in the Config.

        :returns: S_OK, or S_ERROR if the services cannot be obtained or the commit to the CS failed
        """
        self.log.info("Checking URLs")
        # get services again, in case they were started/stop in controlComponents
        gConfig.forceRefresh(fromMaster=True)

        # get port used for https based services
        try:
            tornadoSystemInstance = PathFinder.getSystemInstance(
                system="Tornado",
                setup=self.setup,
            )
            # NOTE(review): hard-coded "/System/Tornado/" section, confirm this is the intended CS path
            self._tornadoPort = gConfig.getValue(
                Path.cfgPath("/System/Tornado/", tornadoSystemInstance, "Port"),
                self._tornadoPort,
            )
        except RuntimeError:
            # keep the current port if the Tornado system instance cannot be resolved
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        res = self.getRunningInstances(instanceType="Services", runitStatus="All")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]
        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" % (service, options))
            # ignore SystemAdministrator, does not have URLs
            if "SystemAdministrator" in service:
                continue
            self._checkServiceURL(service, options)

        if self.csAPI.csModified and self.commitURLs:
            self.log.info("Commiting changes to the CS")
            result = self.csAPI.commit()
            if not result["OK"]:
                self.logError("Commit to CS failed", result["Message"])
                return S_ERROR("Failed to commit to CS")
        return S_OK()

    def _checkServiceURL(self, serviceName, options):
        """Ensure service URL is properly configured in the CS.

        The URL of a running service is added to the configured URLs, the URL of a stopped service is removed.

        :param str serviceName: name of the service
        :param dict options: instance information as returned by :func:`getRunningInstances`
        """
        url = self._getURL(serviceName, options)
        system = options["System"]
        module = options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))
        urlsConfigPath = Path.cfgPath(PathFinder.getSystemURLSection(system=system, setup=self.setup), module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)
        runitStatus = options["RunitStatus"]
        # without CommitURLs the modification is never committed, make that visible in the message
        wouldHave = "Would have " if not self.commitURLs else ""
        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
        if runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Return URL for the service.

        Port and Protocol are taken from the options, falling back to the tornado port and 'dips'.

        :param str serviceName: name of the service
        :param dict options: instance information as returned by :func:`getRunningInstances`
        :returns: URL of the service on this host
        """
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        host = socket.getfqdn()
        protocol = options.get("Protocol", "dips")
        url = "%s://%s:%s/%s/%s" % (protocol, host, port, system, serviceName)
        return url
class WorkflowTasks(TaskBase): """Handles jobs""" def __init__( self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None, outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None, ownerDN=None, ownerGroup=None, ): """Generates some default objects. jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works: VOs can pass in their job class extension, if present """ if not logger: logger = gLogger.getSubLogger(self.__class__.__name__) super(WorkflowTasks, self).__init__(transClient, logger) useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False if not submissionClient: self.submissionClient = WMSClient(useCertificates=useCertificates, delegatedDN=ownerDN, delegatedGroup=ownerGroup) else: self.submissionClient = submissionClient if not jobMonitoringClient: self.jobMonitoringClient = JobMonitoringClient() else: self.jobMonitoringClient = jobMonitoringClient if not jobClass: self.jobClass = Job else: self.jobClass = jobClass if not opsH: self.opsH = Operations() else: self.opsH = opsH if not outputDataModule: self.outputDataModule = self.opsH.getValue( "Transformations/OutputDataModule", "") else: self.outputDataModule = outputDataModule if not destinationPlugin: self.destinationPlugin = self.opsH.getValue( "Transformations/DestinationPlugin", "BySE") else: self.destinationPlugin = destinationPlugin self.destinationPlugin_o = None self.outputDataModule_o = None def prepareTransformationTasks(self, transBody, taskDict, owner="", ownerGroup="", ownerDN="", bulkSubmissionFlag=False): """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works. 
:param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param str owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :param bool bulkSubmissionFlag: flag for using bulk submission or not :return: S_OK/S_ERROR with updated taskDict """ if (not owner) or (not ownerGroup): res = getProxyInfo(False, False) if not res["OK"]: return res proxyInfo = res["Value"] owner = proxyInfo["username"] ownerGroup = proxyInfo["group"] if not ownerDN: res = getDNForUsername(owner) if not res["OK"]: return res ownerDN = res["Value"][0] if bulkSubmissionFlag: return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN) # not a bulk submission return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN) def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN): """Prepare transformation tasks with a single job object for bulk submission :param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param str owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if taskDict: transID = list(taskDict.values())[0]["TransformationID"] else: return S_OK({}) method = "__prepareTasksBulk" startTime = time.time() # Prepare the bulk Job object with common parameters oJob = self.jobClass(transBody) self._logVerbose("Setting job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method) oJob.setOwner(owner) oJob.setOwnerGroup(ownerGroup) oJob.setOwnerDN(ownerDN) try: site = oJob.workflow.findParameter("Site").getValue() except AttributeError: site = None jobType = oJob.workflow.findParameter("JobType").getValue() transGroup = str(transID).zfill(8) # Verify that the JOB_ID parameter 
is added to the workflow if not oJob.workflow.findParameter("JOB_ID"): oJob._addParameter(oJob.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID") if oJob.workflow.findParameter("PRODUCTION_ID"): oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8)) # pylint: disable=protected-access else: oJob._addParameter( oJob.workflow, # pylint: disable=protected-access "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID", ) oJob.setType(jobType) self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method) oJob.setJobGroup(transGroup) clinicPath = self._checkSickTransformations(transID) if clinicPath: self._handleHospital(oJob, clinicPath) # Collect per job parameters sequences paramSeqDict = {} # tasks must be sorted because we use bulk submission and we must find the correspondance for taskID in sorted(taskDict): paramsDict = taskDict[taskID] seqDict = {} if site is not None: paramsDict["Site"] = site paramsDict["JobType"] = jobType # Handle destination site sites = self._handleDestination(paramsDict) if not sites: self._logError("Could not get a list a sites", transID=transID, method=method) return S_ERROR(ETSUKN, "Can not evaluate destination site") else: self._logVerbose("Setting Site: ", str(sites), transID=transID, method=method) seqDict["Site"] = sites seqDict["JobName"] = self._transTaskName(transID, taskID) seqDict["JOB_ID"] = str(taskID).zfill(8) self._logDebug( "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)), transID=transID, method=method, ) # Handle Input Data inputData = paramsDict.get("InputData") if inputData: if isinstance(inputData, six.string_types): inputData = inputData.replace(" ", "").split(";") self._logVerbose("Setting input data to %s" % inputData, transID=transID, method=method) seqDict["InputData"] = inputData elif paramSeqDict.get("InputData") is not None: self._logError( "Invalid mixture of jobs with and without input data") return S_ERROR( 
ETSDATA, "Invalid mixture of jobs with and without input data") for paramName, paramValue in paramsDict.items(): if paramName not in ("InputData", "Site", "TargetSE"): if paramValue: self._logVerbose("Setting %s to %s" % (paramName, paramValue), transID=transID, method=method) seqDict[paramName] = paramValue outputParameterList = [] if self.outputDataModule: res = self.getOutputData({ "Job": oJob._toXML(), # pylint: disable=protected-access "TransformationID": transID, "TaskID": taskID, "InputData": inputData, }) if not res["OK"]: self._logError("Failed to generate output data", res["Message"], transID=transID, method=method) continue for name, output in res["Value"].items(): seqDict[name] = output outputParameterList.append(name) if oJob.workflow.findParameter(name): oJob._setParamValue(name, "%%(%s)s" % name) # pylint: disable=protected-access else: oJob._addParameter( oJob.workflow, name, "JDL", "%%(%s)s" % name, name # pylint: disable=protected-access ) for pName, seq in seqDict.items(): paramSeqDict.setdefault(pName, []).append(seq) for paramName, paramSeq in paramSeqDict.items(): if paramName in ["JOB_ID", "PRODUCTION_ID", "InputData" ] + outputParameterList: res = oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName) else: res = oJob.setParameterSequence(paramName, paramSeq) if not res["OK"]: return res if taskDict: self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime) taskDict["BulkJobObject"] = oJob return S_OK(taskDict) def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN): """Prepare transformation tasks with a job object per task :param str transBody: transformation job template :param dict taskDict: dictionary of per task parameters :param owner: owner of the transformation :param str ownerGroup: group of the owner of the transformation :param str ownerDN: DN of the owner of the transformation :return: S_OK/S_ERROR with updated taskDict """ if taskDict: transID = 
list(taskDict.values())[0]["TransformationID"] else: return S_OK({}) method = "__prepareTasks" startTime = time.time() oJobTemplate = self.jobClass(transBody) oJobTemplate.setOwner(owner) oJobTemplate.setOwnerGroup(ownerGroup) oJobTemplate.setOwnerDN(ownerDN) try: site = oJobTemplate.workflow.findParameter("Site").getValue() except AttributeError: site = None jobType = oJobTemplate.workflow.findParameter("JobType").getValue() templateOK = False getOutputDataTiming = 0.0 for taskID, paramsDict in taskDict.items(): # Create a job for each task and add it to the taskDict if not templateOK: templateOK = True # Update the template with common information self._logVerbose("Job owner:group to %s:%s" % (owner, ownerGroup), transID=transID, method=method) transGroup = str(transID).zfill(8) self._logVerbose("Adding default transformation group of %s" % (transGroup), transID=transID, method=method) oJobTemplate.setJobGroup(transGroup) if oJobTemplate.workflow.findParameter("PRODUCTION_ID"): oJobTemplate._setParamValue("PRODUCTION_ID", str(transID).zfill(8)) else: oJobTemplate._addParameter(oJobTemplate.workflow, "PRODUCTION_ID", "string", str(transID).zfill(8), "Production ID") if not oJobTemplate.workflow.findParameter("JOB_ID"): oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID", "string", "00000000", "Initial JOB_ID") if site is not None: paramsDict["Site"] = site paramsDict["JobType"] = jobType # Now create the job from the template oJob = copy.deepcopy(oJobTemplate) constructedName = self._transTaskName(transID, taskID) self._logVerbose("Setting task name to %s" % constructedName, transID=transID, method=method) oJob.setName(constructedName) oJob._setParamValue("JOB_ID", str(taskID).zfill(8)) inputData = None self._logDebug( "TransID: %s, TaskID: %s, paramsDict: %s" % (transID, taskID, str(paramsDict)), transID=transID, method=method, ) # These helper functions do the real job sites = self._handleDestination(paramsDict) if not sites: self._logError("Could not get 
a list a sites", transID=transID, method=method) paramsDict["TaskObject"] = "" continue else: self._logDebug("Setting Site: ", str(sites), transID=transID, method=method) res = oJob.setDestination(sites) if not res["OK"]: self._logError("Could not set the site: %s" % res["Message"], transID=transID, method=method) paramsDict["TaskObject"] = "" continue self._handleInputs(oJob, paramsDict) self._handleRest(oJob, paramsDict) clinicPath = self._checkSickTransformations(transID) if clinicPath: self._handleHospital(oJob, clinicPath) paramsDict["TaskObject"] = "" if self.outputDataModule: getOutputDataTiming -= time.time() res = self.getOutputData({ "Job": oJob._toXML(), "TransformationID": transID, "TaskID": taskID, "InputData": inputData }) getOutputDataTiming += time.time() if not res["OK"]: self._logError("Failed to generate output data", res["Message"], transID=transID, method=method) continue for name, output in res["Value"].items(): oJob._addJDLParameter(name, ";".join(output)) paramsDict["TaskObject"] = oJob if taskDict: self._logVerbose( "Average getOutputData time: %.1f per task" % (getOutputDataTiming / len(taskDict)), transID=transID, method=method, ) self._logInfo("Prepared %d tasks" % len(taskDict), transID=transID, method=method, reftime=startTime) return S_OK(taskDict) ############################################################################# def _handleDestination(self, paramsDict): """Handle Sites and TargetSE in the parameters""" try: sites = ["ANY"] if paramsDict["Site"]: # 'Site' comes from the XML and therefore is ; separated sites = fromChar(paramsDict["Site"], sepChar=";") except KeyError: pass if self.destinationPlugin_o: destinationPlugin_o = self.destinationPlugin_o else: res = self.__generatePluginObject(self.destinationPlugin) if not res["OK"]: self._logFatal( "Could not generate a destination plugin object") return res destinationPlugin_o = res["Value"] self.destinationPlugin_o = destinationPlugin_o 
destinationPlugin_o.setParameters(paramsDict) destSites = destinationPlugin_o.run() if not destSites: return sites # Now we need to make the AND with the sites, if defined if sites != ["ANY"]: # Need to get the AND destSites &= set(sites) return list(destSites) def _handleInputs(self, oJob, paramsDict): """set job inputs (+ metadata)""" inputData = paramsDict.get("InputData") transID = paramsDict["TransformationID"] if inputData: self._logVerbose("Setting input data to %s" % inputData, transID=transID, method="_handleInputs") res = oJob.setInputData(inputData) if not res["OK"]: self._logError("Could not set the inputs: %s" % res["Message"], transID=transID, method="_handleInputs") def _handleRest(self, oJob, paramsDict): """add as JDL parameters all the other parameters that are not for inputs or destination""" transID = paramsDict["TransformationID"] for paramName, paramValue in paramsDict.items(): if paramName not in ("InputData", "Site", "TargetSE"): if paramValue: self._logDebug("Setting %s to %s" % (paramName, paramValue), transID=transID, method="_handleRest") oJob._addJDLParameter(paramName, paramValue) def _checkSickTransformations(self, transID): """Check if the transformation is in the transformations to be processed at Hospital or Clinic""" transID = int(transID) clinicPath = "Hospital" if transID in set( int(x) for x in self.opsH.getValue( os.path.join(clinicPath, "Transformations"), [])): return clinicPath if "Clinics" in self.opsH.getSections("Hospital").get("Value", []): basePath = os.path.join("Hospital", "Clinics") clinics = self.opsH.getSections(basePath)["Value"] for clinic in clinics: clinicPath = os.path.join(basePath, clinic) if transID in set( int(x) for x in self.opsH.getValue( os.path.join(clinicPath, "Transformations"), [])): return clinicPath return None def _handleHospital(self, oJob, clinicPath): """Optional handle of hospital/clinic jobs""" if not clinicPath: return oJob.setInputDataPolicy("download", dataScheduling=False) # Check 
first for a clinic, if not it must be the general hospital hospitalSite = self.opsH.getValue( os.path.join(clinicPath, "ClinicSite"), "") hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"), []) # If not found, get the hospital parameters if not hospitalSite: hospitalSite = self.opsH.getValue("Hospital/HospitalSite", "DIRAC.JobDebugger.ch") if not hospitalCEs: hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", []) oJob.setDestination(hospitalSite) if hospitalCEs: oJob._addJDLParameter("GridCE", hospitalCEs) def __generatePluginObject(self, plugin): """This simply instantiates the TaskManagerPlugin class with the relevant plugin name""" method = "__generatePluginObject" try: plugModule = __import__(self.pluginLocation, globals(), locals(), ["TaskManagerPlugin"]) except ImportError as e: self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e), method=method) return S_ERROR() try: plugin_o = getattr(plugModule, "TaskManagerPlugin")("%s" % plugin, operationsHelper=self.opsH) return S_OK(plugin_o) except AttributeError as e: self._logException("Failed to create %s(): %s." 
% (plugin, e), method=method) return S_ERROR() ############################################################################# def getOutputData(self, paramDict): """Get the list of job output LFNs from the provided plugin""" if not self.outputDataModule_o: # Create the module object moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(self.outputDataModule, None) if not moduleInstance["OK"]: return moduleInstance self.outputDataModule_o = moduleInstance["Value"] # This is the "argument" to the module, set it and then execute self.outputDataModule_o.paramDict = paramDict return self.outputDataModule_o.execute() def submitTransformationTasks(self, taskDict): """Submit the tasks""" if "BulkJobObject" in taskDict: return self.__submitTransformationTasksBulk(taskDict) return self.__submitTransformationTasks(taskDict) def __submitTransformationTasksBulk(self, taskDict): """Submit jobs in one go with one parametric job""" if not taskDict: return S_OK(taskDict) startTime = time.time() method = "__submitTransformationTasksBulk" oJob = taskDict.pop("BulkJobObject") # we can only do this, once the job has been popped, or we _might_ crash transID = list(taskDict.values())[0]["TransformationID"] if oJob is None: self._logError("no bulk Job object found", transID=transID, method=method) return S_ERROR(ETSUKN, "No bulk job object provided for submission") result = self.submitTaskToExternal(oJob) if not result["OK"]: self._logError("Failed to submit tasks to external", transID=transID, method=method) return result jobIDList = result["Value"] if len(jobIDList) != len(taskDict): for task in taskDict.values(): task["Success"] = False return S_ERROR( ETSUKN, "Submitted less number of jobs than requested tasks") # Get back correspondence with tasks sorted by ID for jobID, taskID in zip(jobIDList, sorted(taskDict)): taskDict[taskID]["ExternalID"] = jobID taskDict[taskID]["Success"] = True submitted = len(jobIDList) self._logInfo( "Submitted %d tasks to WMS in %.1f 
seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) return S_OK(taskDict) def __submitTransformationTasks(self, taskDict): """Submit jobs one by one""" method = "__submitTransformationTasks" submitted = 0 failed = 0 startTime = time.time() for task in taskDict.values(): transID = task["TransformationID"] if not task["TaskObject"]: task["Success"] = False failed += 1 continue res = self.submitTaskToExternal(task["TaskObject"]) if res["OK"]: task["ExternalID"] = res["Value"] task["Success"] = True submitted += 1 else: self._logError("Failed to submit task to WMS", res["Message"], transID=transID, method=method) task["Success"] = False failed += 1 if submitted: self._logInfo( "Submitted %d tasks to WMS in %.1f seconds" % (submitted, time.time() - startTime), transID=transID, method=method, ) if failed: self._logError("Failed to submit %d tasks to WMS." % (failed), transID=transID, method=method) return S_OK(taskDict) def submitTaskToExternal(self, job): """Submits a single job (which can be a bulk one) to the WMS.""" if isinstance(job, six.string_types): try: oJob = self.jobClass(job) except Exception as x: # pylint: disable=broad-except self._logException("Failed to create job object", "", x) return S_ERROR("Failed to create job object") elif isinstance(job, self.jobClass): oJob = job else: self._logError("No valid job description found") return S_ERROR("No valid job description found") workflowFileObject = StringIO(oJob._toXML()) jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject) return self.submissionClient.submitJob(jdl, workflowFileObject) def updateTransformationReservedTasks(self, taskDicts): transID = None jobNames = [ self._transTaskName(taskDict["TransformationID"], taskDict["TaskID"]) for taskDict in taskDicts ] res = self.jobMonitoringClient.getJobs({"JobName": jobNames}) if not res["OK"]: self._logError( "Failed to get task from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) 
return res jobNameIDs = {} for wmsID in res["Value"]: res = self.jobMonitoringClient.getJobSummary(int(wmsID)) if not res["OK"]: self._logWarn( "Failed to get task summary from WMS", res["Message"], transID=transID, method="updateTransformationReservedTasks", ) else: jobNameIDs[res["Value"]["JobName"]] = int(wmsID) noTask = list(set(jobNames) - set(jobNameIDs)) return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs}) def getSubmittedTaskStatus(self, taskDicts): """ Check the status of a list of tasks and return lists of taskIDs for each new status """ method = "getSubmittedTaskStatus" if taskDicts: wmsIDs = [ int(taskDict["ExternalID"]) for taskDict in taskDicts if int(taskDict["ExternalID"]) ] transID = taskDicts[0]["TransformationID"] else: return S_OK({}) res = self.jobMonitoringClient.getJobsStatus(wmsIDs) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] updateDict = {} for taskDict in taskDicts: taskID = taskDict["TaskID"] wmsID = int(taskDict["ExternalID"]) if not wmsID: continue oldStatus = taskDict["ExternalStatus"] newStatus = statusDict.get(wmsID, {}).get("Status", "Removed") if oldStatus != newStatus: if newStatus == "Removed": self._logVerbose( "Production/Job %d/%d removed from WMS while it is in %s status" % (transID, taskID, oldStatus), transID=transID, method=method, ) newStatus = "Failed" self._logVerbose( "Setting job status for Production/Job %d/%d to %s" % (transID, taskID, newStatus), transID=transID, method=method, ) updateDict.setdefault(newStatus, []).append(taskID) return S_OK(updateDict) def getSubmittedFileStatus(self, fileDicts): """ Check the status of a list of files and return the new status of each LFN """ if not fileDicts: return S_OK({}) method = "getSubmittedFileStatus" # All files are from the same transformation transID = fileDicts[0]["TransformationID"] taskFiles = {} for fileDict in fileDicts: jobName = 
self._transTaskName(transID, fileDict["TaskID"]) taskFiles.setdefault(jobName, {})[fileDict["LFN"]] = fileDict["Status"] res = self.updateTransformationReservedTasks(fileDicts) if not res["OK"]: self._logWarn("Failed to obtain taskIDs for files", transID=transID, method=method) return res noTasks = res["Value"]["NoTasks"] taskNameIDs = res["Value"]["TaskNameIDs"] updateDict = {} for jobName in noTasks: for lfn, oldStatus in taskFiles[jobName].items(): if oldStatus != TransformationFilesStatus.UNUSED: updateDict[lfn] = TransformationFilesStatus.UNUSED res = self.jobMonitoringClient.getJobsStatus(list( taskNameIDs.values())) if not res["OK"]: self._logWarn("Failed to get job status from the WMS system", transID=transID, method=method) return res statusDict = res["Value"] for jobName, wmsID in taskNameIDs.items(): jobStatus = statusDict.get(wmsID, {}).get("Status") newFileStatus = { "Done": TransformationFilesStatus.PROCESSED, "Completed": TransformationFilesStatus.PROCESSED, "Failed": TransformationFilesStatus.UNUSED, }.get(jobStatus) if newFileStatus: for lfn, oldStatus in taskFiles[jobName].items(): if newFileStatus != oldStatus: updateDict[lfn] = newFileStatus return S_OK(updateDict)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit several jobs and exercise the JobMonitoring and JobStateUpdate clients on them.

    One job is submitted for each combination of input data list and job type. The
    selectors of the monitoring client are then checked, and the status of the last
    submitted job is driven through a few updates. The recorded jobs are deleted at the end.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # Now, let's submit some jobs. Different sites, types, inputs
    jobIDs = []
    lfnss = [["/a/1.txt", "/a/2.txt"], ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
    types = ["User", "Test"]
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination("DIRAC.Jenkins.ch")
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res["OK"], res.get("Message"))
            jobID = res["Value"]
            # Record every submitted job, so that all of them are checked and deleted below
            jobIDs.append(jobID)

    # Selectors
    res = jobMonitor.getSites()
    print(res)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertTrue(set(res["Value"]) <= {"ANY", "DIRAC.Jenkins.ch", "Site"}, msg="Got %s" % res["Value"])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(sorted(res["Value"]), sorted(types), msg="Got %s" % str(sorted(res["Value"])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], ["app status", "Unknown"], msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getOwners()
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res["OK"], res.get("Message"))

    # Job groups: no cut-off and a cut-off at "now" must agree, and the groups
    # selected with a one year old cut-off must be a subset of them
    res = jobMonitor.getJobGroups()
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_empty = res["Value"]
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_olderThanNow = res["Value"]
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res["OK"], res.get("Message"))
    resJG_olderThanOneYear = res["Value"]
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)), resJG_olderThanOneYear)

    # States: assertIn reports the value found and the accepted ones on failure
    res = jobMonitor.getStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertIn(
        sorted(res["Value"]),
        [[JobStatus.RECEIVED], sorted([JobStatus.RECEIVED, JobStatus.KILLED])],
        res["Value"],
    )
    res = jobMonitor.getMinorStates()
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertIn(
        sorted(res["Value"]),
        [
            ["Job accepted"],
            sorted(["Job accepted", "Job Rescheduled"]),
            sorted(["Job accepted", "Marked for termination"]),
        ],
        res["Value"],
    )

    res = jobMonitor.getJobs()
    self.assertTrue(res["OK"], res.get("Message"))
    # getJobs is compared on the string form of the job IDs
    self.assertTrue({str(x) for x in jobIDs} <= set(res["Value"]), res["Value"])
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res["OK"], res.get("Message"))

    # Status updates, applied to the last submitted job
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                "Status": JobStatus.CHECKING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            }
        },
        False,
    )
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.CHECKING)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus")

    # Two updates in one call: the one with the latest timestamp is expected to win
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)): {
                "Status": JobStatus.WAITING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            },
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)): {
                "Status": JobStatus.MATCHED,
                "MinorStatus": "MinorStatus-matched",
                "Source": "Unknown",
            },
        },
        False,
    )
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

    # Setting a job parameter must leave the status untouched
    res = jobStateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]})
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
    self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

    res = jobStateUpdate.setJobAttribute(jobID, "Status", JobStatus.RUNNING)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobSummary(int(jobID))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING)

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)