Exemplo n.º 1
0
class WorkflowTasks(TaskBase):
    """ Handles jobs
  """
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None,
                 destinationPlugin=None,
                 ownerDN=None,
                 ownerGroup=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = True if (bool(ownerDN)
                                   and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                'Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        self.destinationPlugin_o = None

        self.outputDataModule_o = None

    def prepareTransformationTasks(self,
                                   transBody,
                                   taskDict,
                                   owner='',
                                   ownerGroup='',
                                   ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.


    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation
    :param bool bulkSubmissionFlag: flag for using bulk submission or not

    :return: S_OK/S_ERROR with updated taskDict
    """

        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner,
                                           ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup,
                                   ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup,
                           ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return: S_OK/S_ERROR with updated taskDict
    """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' %
                         (owner, ownerGroup),
                         transID=transID,
                         method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000',
                               "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                'PRODUCTION_ID',
                'string',
                str(transID).zfill(8),
                "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' %
                         (transGroup),
                         transID=transID,
                         method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [
                int(x)
                for x in self.opsH.getValue("Hospital/Transformations", [])
        ]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID,
                               method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose('Setting Site: ',
                                 str(sites),
                                 transID=transID,
                                 method=method)
                seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' %
                           (transID, taskID, str(paramsDict)),
                           transID=transID,
                           method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                if isinstance(inputData, basestring):
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID,
                                 method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                self._logError(
                    "Invalid mixture of jobs with and without input data")
                return S_ERROR(
                    ETSDATA,
                    "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' %
                                         (paramName, paramValue),
                                         transID=transID,
                                         method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({
                    'Job': oJob._toXML(),
                    'TransformationID': transID,  # pylint: disable=protected-access
                    'TaskID': taskID,
                    'InputData': inputData
                })
                if not res['OK']:
                    self._logError("Failed to generate output data",
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow,  # pylint: disable=protected-access
                            name,
                            'JDL',
                            "%%(%s)s" % name,
                            name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'
                             ] + outputParameterList:
                res = oJob.setParameterSequence(paramName,
                                                paramSeq,
                                                addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        if taskDict:
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return:  S_OK/S_ERROR with updated taskDict
    """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)
        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateOK = False
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose('Job owner:group to %s:%s' %
                                 (owner, ownerGroup),
                                 transID=transID,
                                 method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' %
                                 (transGroup),
                                 transID=transID,
                                 method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID',
                                                str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID', 'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID',
                                               'string', '00000000',
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID,
                             method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' %
                           (transID, taskID, str(paramsDict)),
                           transID=transID,
                           method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID,
                               method=method)
                paramsDict['TaskObject'] = ''
                continue
            else:
                self._logDebug('Setting Site: ',
                               str(sites),
                               transID=transID,
                               method=method)
                res = oJob.setDestination(sites)
                if not res['OK']:
                    self._logError('Could not set the site: %s' %
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    paramsDict['TaskObject'] = ''
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [
                int(x)
                for x in self.opsH.getValue("Hospital/Transformations", [])
            ]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({
                    'Job': oJob._toXML(),
                    'TransformationID': transID,
                    'TaskID': taskID,
                    'InputData': inputData
                })
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data",
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob
        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' %
                             (getOutputDataTiming / len(taskDict)),
                             transID=transID,
                             method=method)
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters
    """

        try:
            sites = ['ANY']
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal(
                    "Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)
    """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID,
                             method='_handleInputs')
            res = oJob.setInputData(inputData)
            if not res['OK']:
                self._logError("Could not set the inputs: %s" % res['Message'],
                               transID=transID,
                               method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination
    """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' %
                                   (paramName, paramValue),
                                   transID=transID,
                                   method='_handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs
    """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite",
                                          'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name
    """
        method = '__generatePluginObject'
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" %
                               (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule,
                               'TaskManagerPlugin')('%s' % plugin,
                                                    operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e),
                               method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin
    """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule,
                                                     None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks
    """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job
    """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found',
                           transID=transID,
                           method=method)
            return S_ERROR(ETSUKN,
                           'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external',
                           transID=transID,
                           method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(
                ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                      (submitted, time.time() - startTime),
                      transID=transID,
                      method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one
    """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS",
                               res['Message'],
                               transID=transID,
                               method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                          (submitted, time.time() - startTime),
                          transID=transID,
                          method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID,
                           method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.
    """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        transID = None
        jobNames = [
            self._transTaskName(taskDict['TransformationID'],
                                taskDict['TaskID']) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS",
                           res['Message'],
                           transID=transID,
                           method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                self._logWarn("Failed to get task summary from WMS",
                              res['Message'],
                              transID=transID,
                              method='updateTransformationReservedTasks')
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
    Check the status of a list of tasks and return lists of taskIDs for each new status
    """
        method = 'getSubmittedTaskStatus'

        if taskDicts:
            wmsIDs = [
                int(taskDict['ExternalID']) for taskDict in taskDicts
                if int(taskDict['ExternalID'])
            ]
            transID = taskDicts[0]['TransformationID']
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        'Production/Job %d/%d removed from WMS while it is in %s status'
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method)
                    newStatus = "Failed"
                self._logVerbose(
                    'Setting job status for Production/Job %d/%d to %s' %
                    (transID, taskID, newStatus),
                    transID=transID,
                    method=method)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
    Check the status of a list of files and return the new status of each LFN
    """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName,
                                 {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID,
                          method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = {
                'Done': 'Processed',
                'Completed': 'Processed',
                'Failed': 'Unused'
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
Exemplo n.º 2
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        jobIDs = []
        dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for dest in dests:
            for lfns in lfnss:
                for jobType in types:
                    job = helloWorldJob()
                    job.setDestination(dest)
                    job.setInputData(lfns)
                    job.setType(jobType)
                    jobDescription = createFile(job)
                    res = wmsClient.submitJob(
                        job._toJDL(xmlFile=jobDescription))
                    self.assert_(res['OK'])
                    jobID = res['Value']
                    jobIDs.append(jobID)

        res = jobMonitor.getSites()
        self.assert_(res['OK'])
        self.assert_(
            set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
        res = jobMonitor.getJobTypes()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(types))
        res = jobMonitor.getApplicationStates()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

        res = jobMonitor.getOwners()
        self.assert_(res['OK'])
        res = jobMonitor.getOwnerGroup()
        self.assert_(res['OK'])
        res = jobMonitor.getProductionIds()
        self.assert_(res['OK'])
        res = jobMonitor.getJobGroups()
        self.assert_(res['OK'])
        res = jobMonitor.getStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(['Job accepted', 'matching'])])
        self.assert_(res['OK'])
        res = jobMonitor.getJobs()
        self.assert_(res['OK'])
        self.assert_(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assert_( res['OK'] )
        res = jobMonitor.getCurrentJobCounters()
        self.assert_(res['OK'])
        try:
            self.assert_(
                res['Value'].get('Received') + res['Value'].get('Waiting') >=
                long(len(dests) * len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assert_(res['OK'])
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assert_(res['OK'])

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assert_(res['OK'])

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
Exemplo n.º 3
0
class WorkflowTasks(TaskBase):
  """ Handles jobs
  """

  def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
               outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
               ownerDN=None, ownerGroup=None):
    """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

    if not logger:
      logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False
    if not submissionClient:
      self.submissionClient = WMSClient(useCertificates=useCertificates,
                                        delegatedDN=ownerDN,
                                        delegatedGroup=ownerGroup)
    else:
      self.submissionClient = submissionClient

    if not jobMonitoringClient:
      self.jobMonitoringClient = JobMonitoringClient()
    else:
      self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
      self.jobClass = Job
    else:
      self.jobClass = jobClass

    if not opsH:
      self.opsH = Operations()
    else:
      self.opsH = opsH

    if not outputDataModule:
      self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
      self.outputDataModule = outputDataModule

    if not destinationPlugin:
      self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    else:
      self.destinationPlugin = destinationPlugin

    self.destinationPlugin_o = None

    self.outputDataModule_o = None

  def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='',
                                 ownerDN='', bulkSubmissionFlag=False):
    """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

    :param transBody: transformation job template
    :param taskDict: dictionary of per task parameters
    :param owner: owner of the transformation
    :param ownerGroup: group of the owner of the transformation
    :param ownerDN: DN of the owner of the transformation
    :return:  S_OK/S_ERROR with updated taskDict
    """

    if (not owner) or (not ownerGroup):
      res = getProxyInfo(False, False)
      if not res['OK']:
        return res
      proxyInfo = res['Value']
      owner = proxyInfo['username']
      ownerGroup = proxyInfo['group']

    if not ownerDN:
      res = getDNForUsername(owner)
      if not res['OK']:
        return res
      ownerDN = res['Value'][0]

    if bulkSubmissionFlag:
      return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
    return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

  def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a single job object for bulk submission
    """
    if taskDict:
      transID = taskDict.values()[0]['TransformationID']
    else:
      return S_OK({})

    # Prepare the bulk Job object with common parameters
    oJob = self.jobClass(transBody)
    method = 'prepareTransformationTasksBulk'

    self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                     transID=transID, method=method)
    oJob.setOwner(owner)
    oJob.setOwnerGroup(ownerGroup)
    oJob.setOwnerDN(ownerDN)

    jobType = oJob.workflow.findParameter('JobType').getValue()
    transGroup = str(transID).zfill(8)

    # Verify that the JOB_ID parameter is added to the workflow
    if not oJob.workflow.findParameter('JOB_ID'):
      oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

    if oJob.workflow.findParameter('PRODUCTION_ID'):
      oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
    else:
      oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                         'PRODUCTION_ID',
                         'string',
                         str(transID).zfill(8),
                         "Production ID")
    oJob.setType(jobType)
    self._logVerbose('Adding default transformation group of %s' % (transGroup),
                     transID=transID, method=method)
    oJob.setJobGroup(transGroup)

    if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
      self._handleHospital(oJob)

    # Collect per job parameters sequences
    paramSeqDict = {}
    # tasks must be sorted because we use bulk submission and we must find the correspondance
    for taskID in sorted(taskDict):
      paramsDict = taskDict[taskID]
      seqDict = {}

      # Handle destination site
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites', transID=transID)
        return S_ERROR(ETSUKN, "Can not evaluate destination site")
      else:
        self._logVerbose('Setting Site: ', str(sites), transID=transID)
        seqDict['Site'] = sites

      seqDict['JobName'] = self._transTaskName(transID, taskID)
      seqDict['JOB_ID'] = str(taskID).zfill(8)

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # Handle Input Data
      inputData = paramsDict.get('InputData')
      if inputData:
        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method=method)
        seqDict['InputData'] = inputData
      elif paramSeqDict.get('InputData') is not None:
        self._logError("Invalid mixture of jobs with and without input data")
        return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

      for paramName, paramValue in paramsDict.iteritems():
        if paramName not in ('InputData', 'Site', 'TargetSE'):
          if paramValue:
            self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                             transID=transID, method=method)
            seqDict[paramName] = paramValue

      outputParameterList = []
      if self.outputDataModule:
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,  # pylint: disable=protected-access
                                  'TaskID': taskID, 'InputData': inputData})
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          seqDict[name] = output
          outputParameterList.append(name)
          if oJob.workflow.findParameter(name):
            oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
          else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               name,
                               'JDL',
                               "%%(%s)s" % name,
                               name)

      for pName, seq in seqDict.iteritems():
        paramSeqDict.setdefault(pName, []).append(seq)

    for paramName, paramSeq in paramSeqDict.iteritems():
      if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList:
        oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
      else:
        oJob.setParameterSequence(paramName, paramSeq)

    taskDict['BulkJobObject'] = oJob
    return S_OK(taskDict)

  def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a job object per task
    """
    method = '__prepareTasks'
    startTime = time.time()
    oJobTemplate = self.jobClass(transBody)
    oJobTemplate.setOwner(owner)
    oJobTemplate.setOwnerGroup(ownerGroup)
    oJobTemplate.setOwnerDN(ownerDN)
    site = oJobTemplate.workflow.findParameter('Site').getValue()
    jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
    templateOK = False
    getOutputDataTiming = 0.
    for taskID, paramsDict in taskDict.iteritems():
      # Create a job for each task and add it to the taskDict
      if not templateOK:
        templateOK = True
        # Update the template with common information
        transID = paramsDict['TransformationID']
        self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        transGroup = str(transID).zfill(8)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJobTemplate.setJobGroup(transGroup)
        if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
          oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
        else:
          oJobTemplate._addParameter(oJobTemplate.workflow,
                                     'PRODUCTION_ID',
                                     'string',
                                     str(transID).zfill(8),
                                     "Production ID")
        if not oJobTemplate.workflow.findParameter('JOB_ID'):
          oJobTemplate._addParameter(oJobTemplate.workflow,
                                     'JOB_ID',
                                     'string',
                                     '00000000',
                                     "Initial JOB_ID")

      paramsDict['Site'] = site
      paramsDict['JobType'] = jobType
      # Now create the job from the template
      oJob = copy.deepcopy(oJobTemplate)
      constructedName = self._transTaskName(transID, taskID)
      self._logVerbose('Setting task name to %s' % constructedName,
                       transID=transID, method=method)
      oJob.setName(constructedName)
      oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
      inputData = None

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # These helper functions do the real job
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites',
                       transID=transID, method=method)
        paramsDict['TaskObject'] = ''
        continue
      else:
        self._logDebug('Setting Site: ', str(sites),
                       transID=transID, method=method)
        res = oJob.setDestination(sites)
        if not res['OK']:
          self._logError('Could not set the site: %s' % res['Message'],
                         transID=transID, method=method)
          continue

      self._handleInputs(oJob, paramsDict)
      self._handleRest(oJob, paramsDict)

      hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
      if int(transID) in hospitalTrans:
        self._handleHospital(oJob)

      paramsDict['TaskObject'] = ''
      if self.outputDataModule:
        getOutputDataTiming -= time.time()
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                  'TaskID': taskID, 'InputData': inputData})
        getOutputDataTiming += time.time()
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          oJob._addJDLParameter(name, ';'.join(output))
      paramsDict['TaskObject'] = oJob
    if taskDict:
      self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                       transID=transID, method=method)
      self._logInfo('Prepared %d tasks' % len(taskDict),
                    transID=transID, method=method, reftime=startTime)
    return S_OK(taskDict)

  #############################################################################

  def _handleDestination(self, paramsDict):
    """ Handle Sites and TargetSE in the parameters
    """

    try:
      sites = ['ANY']
      if paramsDict['Site']:
        # 'Site' comes from the XML and therefore is ; separated
        sites = fromChar(paramsDict['Site'], sepChar=';')
    except KeyError:
      pass

    if self.destinationPlugin_o:
      destinationPlugin_o = self.destinationPlugin_o
    else:
      res = self.__generatePluginObject(self.destinationPlugin)
      if not res['OK']:
        self._logFatal("Could not generate a destination plugin object")
        return res
      destinationPlugin_o = res['Value']
      self.destinationPlugin_o = destinationPlugin_o

    destinationPlugin_o.setParameters(paramsDict)
    destSites = destinationPlugin_o.run()
    if not destSites:
      return sites

    # Now we need to make the AND with the sites, if defined
    if sites != ['ANY']:
      # Need to get the AND
      destSites &= set(sites)

    return list(destSites)

  def _handleInputs(self, oJob, paramsDict):
    """ set job inputs (+ metadata)
    """
    inputData = paramsDict.get('InputData')
    transID = paramsDict['TransformationID']
    if inputData:
      self._logVerbose('Setting input data to %s' % inputData,
                       transID=transID, method='handleInputs')
      oJob.setInputData(inputData)

  def _handleRest(self, oJob, paramsDict):
    """ add as JDL parameters all the other parameters that are not for inputs or destination
    """
    transID = paramsDict['TransformationID']
    for paramName, paramValue in paramsDict.iteritems():
      if paramName not in ('InputData', 'Site', 'TargetSE'):
        if paramValue:
          self._logDebug('Setting %s to %s' % (paramName, paramValue),
                         transID=transID, method='handleRest')
          oJob._addJDLParameter(paramName, paramValue)

  def _handleHospital(self, oJob):
    """ Optional handle of hospital jobs
    """
    oJob.setType('Hospital')
    oJob.setInputDataPolicy('download', dataScheduling=False)
    hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
    oJob.setDestination(hospitalSite)
    hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
    if hospitalCEs:
      oJob._addJDLParameter('GridCE', hospitalCEs)

  def __generatePluginObject(self, plugin):
    """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name
    """
    try:
      plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
    except ImportError as e:
      self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e))
      return S_ERROR()
    try:
      plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
      return S_OK(plugin_o)
    except AttributeError as e:
      self._logException("Failed to create %s(): %s." % (plugin, e))
      return S_ERROR()

  #############################################################################

  def getOutputData(self, paramDict):
    """ Get the list of job output LFNs from the provided plugin
    """
    if not self.outputDataModule_o:
      # Create the module object
      moduleFactory = ModuleFactory()

      moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
      if not moduleInstance['OK']:
        return moduleInstance
      self.outputDataModule_o = moduleInstance['Value']
    # This is the "argument" to the module, set it and then execute
    self.outputDataModule_o.paramDict = paramDict
    return self.outputDataModule_o.execute()

  def submitTransformationTasks(self, taskDict):
    """ Submit the tasks
    """
    if 'BulkJobObject' in taskDict:
      return self.__submitTransformationTasksBulk(taskDict)
    return self.__submitTransformationTasks(taskDict)

  def __submitTransformationTasksBulk(self, taskDict):
    """ Submit jobs in one go with one parametric job
    """
    if not taskDict:
      return S_OK(taskDict)
    startTime = time.time()

    oJob = taskDict.pop('BulkJobObject')
    # we can only do this, once the job has been popped, or we _might_ crash
    transID = taskDict.values()[0]['TransformationID']
    if oJob is None:
      self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk')
      return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

    result = self.submitTaskToExternal(oJob)
    if not result['OK']:
      return result

    jobIDList = result['Value']
    if len(jobIDList) != len(taskDict):
      for task in taskDict.values():
        task['Success'] = False
      return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
    # Get back correspondance with tasks sorted by ID
    for jobID, taskID in zip(jobIDList, sorted(taskDict)):
      taskDict[taskID]['ExternalID'] = jobID
      taskDict[taskID]['Success'] = True

    submitted = len(jobIDList)
    self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                  transID=transID, method='submitTransformationTasksBulk')
    return S_OK(taskDict)

  def __submitTransformationTasks(self, taskDict):
    """ Submit jobs one by one
    """
    method = 'submitTransformationTasks'
    submitted = 0
    failed = 0
    startTime = time.time()
    for task in taskDict.itervalues():
      transID = task['TransformationID']
      if not task['TaskObject']:
        task['Success'] = False
        failed += 1
        continue
      res = self.submitTaskToExternal(task['TaskObject'])
      if res['OK']:
        task['ExternalID'] = res['Value']
        task['Success'] = True
        submitted += 1
      else:
        self._logError("Failed to submit task to WMS", res['Message'],
                       transID=transID, method=method)
        task['Success'] = False
        failed += 1
    if submitted:
      self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                    transID=transID, method=method)
    if failed:
      self._logError('Failed to submit %d tasks to WMS.' % (failed),
                     transID=transID, method=method)
    return S_OK(taskDict)

  def submitTaskToExternal(self, job):
    """ Submits a single job to the WMS.
    """
    if isinstance(job, basestring):
      try:
        oJob = self.jobClass(job)
      except Exception as x:  # pylint: disable=broad-except
        self._logException("Failed to create job object", '', x)
        return S_ERROR("Failed to create job object")
    elif isinstance(job, self.jobClass):
      oJob = job
    else:
      self._logError("No valid job description found")
      return S_ERROR("No valid job description found")

    workflowFileObject = StringIO.StringIO(oJob._toXML())
    jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
    return self.submissionClient.submitJob(jdl, workflowFileObject)

  def updateTransformationReservedTasks(self, taskDicts):
    transID = None
    jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts]
    res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
    if not res['OK']:
      self._logError("Failed to get task from WMS", res['Message'],
                     transID=transID, method='updateTransformationReservedTasks')
      return res
    jobNameIDs = {}
    for wmsID in res['Value']:
      res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
      if not res['OK']:
        self._logWarn("Failed to get task summary from WMS", res['Message'],
                      transID=transID, method='updateTransformationReservedTasks')
      else:
        jobNameIDs[res['Value']['JobName']] = int(wmsID)

    noTask = list(set(jobNames) - set(jobNameIDs))
    return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

  def getSubmittedTaskStatus(self, taskDicts):
    """
    Check the status of a list of tasks and return lists of taskIDs for each new status
    """
    if taskDicts:
      wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
      transID = taskDicts[0]['TransformationID']
    else:
      return S_OK({})
    res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID)
      return res
    statusDict = res['Value']
    updateDict = {}
    for taskDict in taskDicts:
      taskID = taskDict['TaskID']
      wmsID = int(taskDict['ExternalID'])
      if not wmsID:
        continue
      oldStatus = taskDict['ExternalStatus']
      newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
      if oldStatus != newStatus:
        if newStatus == "Removed":
          self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                           (transID, taskID, oldStatus), transID=transID)
          newStatus = "Failed"
        self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                         transID=transID)
        updateDict.setdefault(newStatus, []).append(taskID)
    return S_OK(updateDict)

  def getSubmittedFileStatus(self, fileDicts):
    """
    Check the status of a list of files and return the new status of each LFN
    """
    if not fileDicts:
      return S_OK({})
    # All files are from the same transformation
    transID = fileDicts[0]['TransformationID']
    taskFiles = {}
    for fileDict in fileDicts:
      jobName = self._transTaskName(transID, fileDict['TaskID'])
      taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

    res = self.updateTransformationReservedTasks(fileDicts)
    if not res['OK']:
      self._logWarn("Failed to obtain taskIDs for files",
                    transID=transID)
      return res
    noTasks = res['Value']['NoTasks']
    taskNameIDs = res['Value']['TaskNameIDs']

    updateDict = {}
    for jobName in noTasks:
      for lfn, oldStatus in taskFiles[jobName].iteritems():
        if oldStatus != 'Unused':
          updateDict[lfn] = 'Unused'

    res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID)
      return res
    statusDict = res['Value']
    for jobName, wmsID in taskNameIDs.iteritems():
      jobStatus = statusDict.get(wmsID, {}).get('Status')
      newFileStatus = {'Done': 'Processed',
                       'Completed': 'Processed',
                       'Failed': 'Unused'}.get(jobStatus)
      if newFileStatus:
        for lfn, oldStatus in taskFiles[jobName].iteritems():
          if newFileStatus != oldStatus:
            updateDict[lfn] = newFileStatus
    return S_OK(updateDict)
Exemplo n.º 4
0
  def test_JobStateUpdateAndJobMonitoringMultuple( self ):
    """ # Now, let's submit some jobs. Different sites, types, inputs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
      for lfns in lfnss:
        for jobType in types:
          job = helloWorldJob()
          job.setDestination( dest )
          job.setInputData( lfns )
          job.setType( jobType )
          jobDescription = createFile( job )
          res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
          self.assert_( res['OK'] )
          jobID = res['Value']
          jobIDs.append( jobID )

    res = jobMonitor.getSites()
    self.assert_( res['OK'] )
    self.assert_( set( res['Value'] ) <= set( dests + ['ANY', 'DIRAC.Jenkins.org'] ) )
    res = jobMonitor.getJobTypes()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( types ) )
    res = jobMonitor.getApplicationStates()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( ['Unknown'] ) )

    res = jobMonitor.getOwners()
    self.assert_( res['OK'] )
    res = jobMonitor.getOwnerGroup()
    self.assert_( res['OK'] )
    res = jobMonitor.getProductionIds()
    self.assert_( res['OK'] )
    res = jobMonitor.getJobGroups()
    self.assert_( res['OK'] )
    res = jobMonitor.getStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Received'], sorted( ['Received', 'Waiting'] )] )
    res = jobMonitor.getMinorStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Job accepted'], sorted( ['Job accepted', 'matching'] ) ] )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobs()
    self.assert_( res['OK'] )
    self.assert_( set( [str( x ) for x in jobIDs] ) <= set( res['Value'] ) )
#     res = jobMonitor.getCounters(attrList)
#     self.assert_( res['OK'] )
    res = jobMonitor.getCurrentJobCounters()
    self.assert_( res['OK'] )
    try:
      self.assert_( res['Value'].get( 'Received' ) + res['Value'].get( 'Waiting' ) >= long( len( dests ) * len( lfnss ) * len( types ) ) )
    except TypeError:
      pass
    res = jobMonitor.getJobsSummary( jobIDs )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobPageSummaryWeb( {}, [], 0, 100 )
    self.assert_( res['OK'] )

    res = jobStateUpdate.setJobStatusBulk( jobID, {str( datetime.datetime.utcnow() ):{'Status': 'Running',
                                                                                      'MinorStatus': 'MinorStatus',
                                                                                      'ApplicationStatus': 'ApplicationStatus',
                                                                                      'Source': 'Unknown'}} )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsParameter( {jobID:['Status', 'Running']} )
    self.assert_( res['OK'] )

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob( jobIDs )
Exemplo n.º 5
0
class MonitorAgents(AgentModule):
  """MonitorAgents class."""

  def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    self.errors = list()
    self.accounting = defaultdict(dict)

    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()

  def logError(self, errStr, varMsg=''):
    """Append errors to a list, which is sent in email notification."""
    self.log.error(errStr, varMsg)
    self.errors.append(errStr + " " + varMsg)

  def beginExecution(self):
    """Reload the configurations before every cycle."""
    self.setup = self.am_getOption("Setup", self.setup)
    self.enabled = self.am_getOption("EnableFlag", self.enabled)
    self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
    self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
    self.restartServices = self.am_getOption("RestartServices", self.restartServices)
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)
    self.addressTo = self.am_getOption('MailTo', self.addressTo)
    self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
    self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents)
    self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs)

    self.csAPI = CSAPI()

    res = self.getRunningInstances(instanceType='Agents')
    if not res["OK"]:
      return S_ERROR("Failure to get running agents")
    self.agents = res["Value"]

    res = self.getRunningInstances(instanceType='Executors')
    if not res["OK"]:
      return S_ERROR("Failure to get running executors")
    self.executors = res["Value"]

    res = self.getRunningInstances(instanceType='Services')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle."""
    if not(self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    rows = []
    for instanceName, val in self.accounting.iteritems():
      rows.append([[instanceName],
                   [val.get('Treatment', 'No Treatment')],
                   [str(val.get('LogAge', 'Not Relevant'))]])

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      emailBody += "\n\nErrors:"
      emailBody += "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        self.log.error("Failure to send Email notification to ", address)
        continue

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of running agents, executors or services.

    Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: Dictionary of running instances
    """
    res = self.sysAdminClient.getOverallStatus()
    if not res["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
      return res

    val = res['Value'][instanceType]
    runningAgents = defaultdict(dict)
    for system, agents in val.iteritems():
      for agentName, agentInfo in agents.iteritems():
        if agentInfo['Setup'] and agentInfo['Installed']:
          if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus:
            continue
          confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName)
          for option, default in (('PollingTime', HOUR), ('Port', None)):
            optPath = os.path.join(confPath, option)
            runningAgents[agentName][option] = gConfig.getValue(optPath, default)
          runningAgents[agentName]["LogFileLocation"] = \
              os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current')
          runningAgents[agentName]["PID"] = agentInfo["PID"]
          runningAgents[agentName]['Module'] = agentInfo['Module']
          runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus']
          runningAgents[agentName]['System'] = system

    return S_OK(runningAgents)

  def on_terminate(self, agentName, process):
    """Execute callback when a process terminates gracefully."""
    self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid))

  def execute(self):
    """Execute checks for agents, executors, services."""
    for instanceType in ('executor', 'agent', 'service'):
      for name, options in getattr(self, instanceType + 's').iteritems():
        # call checkAgent, checkExecutor, checkService
        res = getattr(self, 'check' + instanceType.capitalize())(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      if "Stopped does not exist" not in res['Message'] and \
         "Running does not exist" not in res['Message']:
        self.logError("Failure to control components", res['Message'])

    if not self.errors:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])
    else:
      self.logError('Something was wrong before, not checking URLs this time')

    self.sendNotification()

    if self.errors:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the age of log file."""
    lastAccessTime = 0
    try:
      lastAccessTime = os.path.getmtime(logFileLocation)
      lastAccessTime = datetime.fromtimestamp(lastAccessTime)
    except OSError as e:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e))

    now = datetime.now()
    age = now - lastAccessTime
    return S_OK(age)

  def restartInstance(self, pid, instanceName, enabled):
    """Kill a process which is then restarted automatically."""
    if not (self.enabled and enabled):
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      agentProc = psutil.Process(int(pid))
      processesToTerminate = agentProc.children(recursive=True)
      processesToTerminate.append(agentProc)

      for proc in processesToTerminate:
        proc.terminate()

      _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5,
                                       callback=partial(self.on_terminate, instanceName))
      for proc in alive:
        self.log.info("Forcefully killing process %s" % proc.pid)
        proc.kill()

      return S_OK()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond."""
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if not pingRes['OK']:
      self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
      res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
      if not res["OK"]:
        return res
      elif res['OK'] and res['Value'] != NO_RESTART:
        self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
        self.log.info("Agent %s has been successfully restarted" % serviceName)
    self.log.info("Service responded OK")
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent."""
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES)))

    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if age.seconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = age.seconds / MINUTES

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES)))

    if age.seconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = age.seconds / MINUTES

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['OK'] and res['Value'] == NO_CHECKING_JOBS:
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Check if there are checking jobs with the **executorName** as current MinorStatus."""
    attrDict = {'Status': 'Checking', 'MinorStatus': executorName}

    # returns list of jobs IDs
    resJobs = self.jobMonClient.getJobs(attrDict)
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs
    if resJobs['Value']:
      self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName))
      return S_OK(CHECKING_JOBS)
    self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
    return S_OK(NO_CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Check for running and stopped components and ensure they have the proper status as defined in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    # get the current status of the components

    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent
    currentStatus = resCurrent['Value']

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault
    defaultStatus = resDefault['Value']

    # ensure instances are in the right state
    shouldBe = {}
    shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down'])
    shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run'])
    shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All'])

    self._ensureComponentRunning(shouldBe['Run'])
    self._ensureComponentDown(shouldBe['Down'])

    for instance in shouldBe['Unknown']:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Get current status for components."""
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall
    currentStatus = {'Down': set(), 'Run': set(), 'All': set()}
    informationDict = resOverall['Value']
    for systemsDict in informationDict.values():
      for system, instancesDict in systemsDict.items():
        for instanceName, instanceInfoDict in instancesDict.items():
          identifier = '%s__%s' % (system, instanceName)
          runitStatus = instanceInfoDict.get('RunitStatus')
          if runitStatus in ('Run', 'Down'):
            currentStatus[runitStatus].add(identifier)

    currentStatus['All'] = currentStatus['Run'] | currentStatus['Down']
    return S_OK(currentStatus)

  def _getDefaultComponentStatus(self):
    """Get the configured status of the components."""
    host = socket.gethostname()
    defaultStatus = {'Down': set(), 'Run': set(), 'All': set()}
    resRunning = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running'))
    resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped'))
    if not resRunning['OK']:
      return resRunning
    if not resStopped['OK']:
      return resStopped
    defaultStatus['Run'] = set(resRunning['Value'].keys())
    defaultStatus['Down'] = set(resStopped['Value'].keys())
    defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down']

    if defaultStatus['Run'].intersection(defaultStatus['Down']):
      self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down'])))
      return S_ERROR("Bad host configuration")

    return S_OK(defaultStatus)

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running."""
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running."""
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Ensure that the running services have their URL in the Config."""
    self.log.info("Checking URLs")
    # get services again, in case they were started/stop in controlComponents
    gConfig.forceRefresh(fromMaster=True)
    res = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]
    for service, options in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (service, options))
      # ignore SystemAdministrator, does not have URLs
      if 'SystemAdministrator' in service:
        continue
      self._checkServiceURL(service, options)

    if self.csAPI.csModified and self.commitURLs:
      self.log.info("Commiting changes to the CS")
      result = self.csAPI.commit()
      if not result['OK']:
        self.logError('Commit to CS failed', result['Message'])
        return S_ERROR("Failed to commit to CS")
    return S_OK()

  def _checkServiceURL(self, serviceName, options):
    """Ensure service URL is properly configured in the CS."""
    url = self._getURL(serviceName, options)
    system = options['System']
    module = options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))
    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)
    runitStatus = options['RunitStatus']
    wouldHave = 'Would have ' if not self.commitURLs else ''
    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
    if runitStatus == 'Down' and url in urls:
      urls.remove(url)
      message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service."""
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
Exemplo n.º 6
0
     conditions = {
         'Status': 'Failed',
         'MinorStatus': 'Maximum of reschedulings reached',
         'ApplicationStatus': 'Failed Input Data Resolution '
     }
     prStr = 'all jobs'
     if production:
         prStr = 'production %s' % ' '.join(production)
         if len(production) == 1:
             production = production[0]
         conditions['JobGroup'] = production
     if userName:
         prStr = 'user %s' % userName
         conditions['Owner'] = userName
     gLogger.always('Obtaining IDR jobs for %s' % prStr)
     res = monitoring.getJobs(conditions)
     if not res['OK']:
         gLogger.always(
             'Error selecting jobs for production %s' % str(production),
             res['Message'])
         DIRAC.exit(2)
     if not res['Value']:
         gLogger.always("No jobs found with IDR for production %s" %
                        str(production))
     elif verbose:
         gLogger.always('Selected %d jobs from production %s' %
                        (len(res['Value']), str(production)))
     jobs = [int(job) for job in res['Value']]
     gLogger.always("Obtained %d jobs... Now analyzing them" % len(jobs))
 if not jobs:
     gLogger.always('No jobs to check, exiting...')
Exemplo n.º 7
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination('DIRAC.Jenkins.ch')
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'},
                        msg="Got %s" % res['Value'])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(types),
                         msg="Got %s" % str(sorted(res['Value'])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(['Unknown']),
                         msg="Got %s" % sorted(str(res['Value'])))

        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_empty = res['Value']
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanNow = res['Value']
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanOneYear = res['Value']
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(
                                         ['Job accepted', 'Job Rescheduled'])])
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'], res.get('Message'))
        try:
            self.assertTrue(
                res['Value'].get('Received') +
                res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'], res.get('Message'))

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'], res.get('Message'))

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
Exemplo n.º 8
0
class TransformationCleaningAgent(AgentModule):
    """
    .. class:: TransformationCleaningAgent

    :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
    :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
    :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance

    """
    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = ["TransformationDB", "MetadataCatalog"]
        # # transformation metadata
        self.transfidmeta = "TransformationID"
        # # archive periof in days
        self.archiveAfter = 7
        # # transformation log SEs
        self.logSE = "LogSE"
        # # enable/disable execution
        self.enableFlag = "True"

        self.dataProcTTypes = ["MCSimulation", "Merge"]
        self.dataManipTTypes = ["Replication", "Removal"]

    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy",
                                              self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes +
                                              self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta",
                                              self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive periof in days
        self.archiveAfter = self.am_getOption("ArchiveAfter",
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()
        # # job monitoring client
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """execution in one agent's cycle

        :param self: self reference
        """

        self.enableFlag = self.am_getOption("EnableFlag", self.enableFlag)
        if self.enableFlag != "True":
            self.log.info(
                "TransformationCleaningAgent is disabled by configuration option EnableFlag"
            )
            return S_OK("Disabled via CS flag")

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            "Status":
            "Cleaning",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Failed to get transformations", res["Message"])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({
            "Status":
            "RemovingFiles",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeRemoval(transDict)
                else:
                    self.log.info(
                        "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeRemoval)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                "Status": "Completed",
                "Type": self.transformationTypes
            },
            older=olderThanTime,
            timeStamp="LastUpdate")
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])
        return S_OK()

    def finalize(self):
        """Only at finalization: will clean ancient transformations (remnants)

            1) get the transformation IDs of jobs that are older than 1 year
            2) find the status of those transformations. Those "Cleaned" and "Archived" will be
               cleaned and archived (again)

        Why doing this here? Basically, it's a race:

        1) the production manager submits a transformation
        2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation in their internal queue,
           so eventually during their (long-ish) cycle they'll work on it.
        3) 1 minute after creating the transformation, the production manager cleans it (by hand, for whatever reason).
           So, the status is changed to "Cleaning"
        4) the TransformationCleaningAgent cleans what has been created (maybe, nothing),
           then sets the transformation status to "Cleaned" or "Archived"
        5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in,
           creating tasks and jobs for a production that's effectively cleaned (but these 2 agents don't know yet).

        Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent,
        but these 2 agents are already doing a lot of stuff, and are pretty heavy.
        So, we should just clean from time to time.
        What I added here is done only when the agent finalize, and it's quite light-ish operation anyway.
        """
        res = self.jobMonitoringClient.getJobGroups(
            None,
            datetime.utcnow() - timedelta(days=365))
        if not res["OK"]:
            self.log.error("Failed to get job groups", res["Message"])
            return res
        transformationIDs = res["Value"]
        if transformationIDs:
            res = self.transClient.getTransformations(
                {"TransformationID": transformationIDs})
            if not res["OK"]:
                self.log.error("Failed to get transformations", res["Message"])
                return res
            transformations = res["Value"]
            toClean = []
            toArchive = []
            for transDict in transformations:
                if transDict["Status"] == "Cleaned":
                    toClean.append(transDict)
                if transDict["Status"] == "Archived":
                    toArchive.append(transDict)

            for transDict in toClean:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            for transDict in toArchive:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            # Remove JobIDs that were unknown to the TransformationSystem
            jobGroupsToCheck = [
                str(transDict["TransformationID"]).zfill(8)
                for transDict in toClean + toArchive
            ]
            res = self.jobMonitoringClient.getJobs(
                {"JobGroup": jobGroupsToCheck})
            if not res["OK"]:
                return res
            jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
            res = self.__removeWMSTasks(jobIDsToRemove)
            if not res["OK"]:
                return res

        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation."""
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict["Type"] in self.dataManipTTypes:
            res = self.archiveTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems archiving transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))
        else:
            res = self.cleanTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems cleaning transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation."""
        res = self.removeTransformationOutput(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems removing transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeArchive(self, transDict):
        """Archive the given transformation."""
        res = self.archiveTransformation(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems archiving transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """get the directories for the supplied transformation from the transformation system.
            These directories are used by removeTransformationOutput and cleanTransformation for removing output.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.verbose(
            "Cleaning Transformation directories of transformation %d" %
            transID)
        directories = []
        if "TransformationDB" in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ["OutputDirectories"])
            if not res["OK"]:
                self.log.error("Failed to obtain transformation directories",
                               res["Message"])
                return res
            transDirectories = []
            if res["Value"]:
                if not isinstance(res["Value"], list):
                    try:
                        transDirectories = ast.literal_eval(res["Value"])
                    except Exception:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res["Value"])
                else:
                    transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if "MetadataCatalog" in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res["OK"]:
                self.log.error("Failed to obtain metadata catalog directories",
                               res["Message"])
                return res
            transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """append unique :newDirs: list to :existingDirs: list

        :param self: self reference
        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths
        """
        for folder in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, str(folder)):
                if folder not in existingDirs:
                    existingDirs.append(os.path.normpath(folder))
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanContent(self, directory):
        """wipe out everything from catalog under folder :directory:

        :param self: self reference
        :params str directory: folder name
        """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res["OK"]:
            return res
        filesFound = res["Value"]
        if not filesFound:
            self.log.info(
                "No files are registered in the catalog directory %s" %
                directory)
            return S_OK()
        self.log.info(
            "Attempting to remove possible remnants from the catalog and storage",
            "(n=%d)" % len(filesFound))

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(filesFound, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        realFailure = False
        for lfn, reason in res["Value"]["Failed"].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog",
                               "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """get catalog contents under paths :directories:

        :param self: self reference
        :param list directories: list of paths in catalog
        """
        self.log.info("Obtaining the catalog contents for %d directories:" %
                      len(directories))
        for directory in directories:
            self.log.info(directory)
        activeDirs = directories
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs[0]
            res = returnSingleResult(fc.listDirectory(currentDir))
            activeDirs.remove(currentDir)
            if not res["OK"] and "Directory does not exist" in res[
                    "Message"]:  # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" %
                              currentDir)
            elif not res["OK"]:
                if "No such file or directory" in res["Message"]:
                    self.log.info("%s: %s" % (currentDir, res["Message"]))
                else:
                    self.log.error(
                        "Failed to get directory %s content" % currentDir,
                        res["Message"])
            else:
                dirContents = res["Value"]
                activeDirs.extend(dirContents["SubDirs"])
                allFiles.update(dirContents["Files"])
        self.log.info("", "Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        """
        self.log.verbose("Removing log files found in the directory",
                         directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory,
                                                       recursive=True))
        if not res["OK"]:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res["Message"])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """This just removes any mention of the output data from the catalog and storage"""
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res))
            return S_OK()
        directories = res["Value"]
        for directory in directories:
            if not re.search("/LOG/", directory):
                res = self.cleanContent(directory)
                if not res["OK"]:
                    return res

        self.log.info("Removed %d directories from the catalog \
      and its files from the storage for transformation %s" %
                      (len(directories), transID))
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully removed output of transformation", transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, "Status", "RemovedFiles")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to RemovedFiles"
                % (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" %
                      (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """This just removes job from the jobDB and the transformation DB

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Archived")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Archived" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to Archived" %
                      (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
        """
        self.log.info("Cleaning transformation", transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search("/LOG/", directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res["OK"]:
                    return res
            res = self.cleanContent(directory)
            if not res["OK"]:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully cleaned transformation", transID)
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Cleaned")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Cleaned" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation",
                      "%s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """wipe out files from catalog"""
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res["OK"]:
            return res
        fileToRemove = res["Value"]
        if not fileToRemove:
            self.log.info("No files found for transID", transID)
            return S_OK()

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(fileToRemove, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        for lfn, reason in res["Value"]["Failed"].items():
            self.log.error("Failed to remove file found in metadata catalog",
                           "%s %s" % (lfn, reason))
        if res["Value"]["Failed"]:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the DFC")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation"""
        self.log.verbose("Cleaning Transformation tasks of transformation",
                         transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res["OK"]:
            return res
        externalIDs = res["Value"]
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ["Type"])
            if not res["OK"]:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res["Value"]
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res["OK"]:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transforamtion ID
        """
        res = self.transClient.getTransformationTasks(
            condDict={"TransformationID": transID})
        if not res["OK"]:
            self.log.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res["Message"])
            return res
        externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """This will remove requests from the RMS system -"""
        rIDs = [int(int(j)) for j in requestIDs if int(j)]
        for reqID in rIDs:
            self.reqClient.cancelRequest(reqID)

        return S_OK()

    def __removeWMSTasks(self, transJobIDs):
        """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system

        :param self: self reference
        :param list trasnJobIDs: job IDs
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):

            res = self.wmsClient.killJob(jobList)
            if res["OK"]:
                self.log.info("Successfully killed %d jobs from WMS" %
                              len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to kill jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to kill jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res["OK"]:
                self.log.info("Successfully deleted jobs from WMS",
                              "(n=%d)" % len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to delete jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to delete jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to delete all remnants from WMS")
        self.log.info("Successfully deleted all tasks from the WMS")

        if not jobIDs:
            self.log.info(
                "JobIDs not present, unable to delete associated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res["OK"]:
            self.log.error("Failed to get requestID for jobs.", res["Message"])
            return res
        failoverRequests.update(res["Value"]["Successful"])
        if not failoverRequests:
            return S_OK()
        for jobID, requestID in res["Value"]["Successful"].items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == "0":
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB",
                               res["Message"])
                failed += 1
            else:
                self.log.verbose("Removed request %s associated to job %d." %
                                 (requestID, jobID))

        if failed:
            self.log.info("Successfully removed requests",
                          "(n=%d)" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove requests", "(n=%d)" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
Exemplo n.º 9
0
class ComponentSupervisionAgent(AgentModule):
    """ComponentSupervisionAgent class."""
    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        self._tornadoPort = "8443"
        self.errors = list()
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn(
        )

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification."""
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the configurations before every cycle."""
        self.setup = self.am_getOption("Setup", self.setup)
        self.enabled = self.am_getOption("EnableFlag", self.enabled)
        self.restartAgents = self.am_getOption("RestartAgents",
                                               self.restartAgents)
        self.restartExecutors = self.am_getOption("RestartExecutors",
                                                  self.restartExecutors)
        self.restartServices = self.am_getOption("RestartServices",
                                                 self.restartServices)
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)
        self.controlComponents = self.am_getOption("ControlComponents",
                                                   self.controlComponents)
        self.commitURLs = self.am_getOption("CommitURLs", self.commitURLs)
        self.doNotRestartInstancePattern = self.am_getOption(
            "DoNotRestartInstancePattern", self.doNotRestartInstancePattern)

        self.csAPI = CSAPI()

        res = self.getRunningInstances(instanceType="Agents")
        if not res["OK"]:
            return S_ERROR("Failure to get running agents")
        self.agents = res["Value"]

        res = self.getRunningInstances(instanceType="Executors")
        if not res["OK"]:
            return S_ERROR("Failure to get running executors")
        self.executors = res["Value"]

        res = self.getRunningInstances(instanceType="Services")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle."""
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = []
        for instanceName, val in self.accounting.items():
            rows.append([[instanceName],
                         [val.get("Treatment", "No Treatment")],
                         [str(val.get("LogAge", "Not Relevant"))]])

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns,
                                    rows,
                                    printOut=False,
                                    numbering=False,
                                    columnSeparator=" | ")

        if self.errors:
            emailBody += "\n\nErrors:"
            emailBody += "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address,
                                        self.emailSubject,
                                        emailBody,
                                        self.addressFrom,
                                        localAttempt=False)
            if not res["OK"]:
                self.log.error("Failure to send Email notification to ",
                               address)
                continue

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of running agents, executors or services.

        Key is component's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: Dictionary of running instances
        """
        res = self.sysAdminClient.getOverallStatus()
        if not res["OK"]:
            self.logError(
                "Failure to get %s from system administrator client" %
                instanceType, res["Message"])
            return res

        val = res["Value"][instanceType]
        runningComponents = defaultdict(dict)
        for system, components in val.items():
            for componentName, componentInfo in components.items():
                if componentInfo["Setup"] and componentInfo["Installed"]:
                    if runitStatus != "All" and componentInfo[
                            "RunitStatus"] != runitStatus:
                        continue
                    for option, default in (("PollingTime", HOUR),
                                            ("Port", None), ("Protocol",
                                                             None)):
                        runningComponents[componentName][
                            option] = self._getComponentOption(
                                instanceType, system, componentName, option,
                                default)
                        # remove empty values so we can use defaults in _getURL
                        if not runningComponents[componentName][option]:
                            runningComponents[componentName].pop(option)
                    runningComponents[componentName][
                        "LogFileLocation"] = os.path.join(
                            self.diracLocation, "runit", system, componentName,
                            "log", "current")
                    runningComponents[componentName]["PID"] = componentInfo[
                        "PID"]
                    runningComponents[componentName]["Module"] = componentInfo[
                        "Module"]
                    runningComponents[componentName][
                        "RunitStatus"] = componentInfo["RunitStatus"]
                    runningComponents[componentName]["System"] = system

        return S_OK(runningComponents)

    def _getComponentOption(self, instanceType, system, componentName, option,
                            default):
        """Get component option from DIRAC CS, using components' base classes methods."""
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )
        if instanceType != "Agents":
            return gConfig.getValue(Path.cfgPath(componentPath, option),
                                    default)
        # deal with agent configuration
        componentLoadModule = gConfig.getValue(
            Path.cfgPath(componentPath, "Module"), componentName)
        fullComponentName = Path.cfgPath(system, componentName)
        fullComponentLoadName = Path.cfgPath(system, componentLoadModule)
        return AgentModule(fullComponentName,
                           fullComponentLoadName).am_getOption(
                               option, default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully."""
        self.log.info(
            "%s's process with ID: %s has been terminated successfully" %
            (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services."""
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                res = getattr(self,
                              "check" + instanceType.capitalize())(name,
                                                                   options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType,
                                  "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            if "Stopped does not exist" not in res[
                    "Message"] and "Running does not exist" not in res[
                        "Message"]:
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError(
                "Something was wrong before, not checking URLs this time")

        self.sendNotification()

        if self.errors:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of log file."""
        lastAccessTime = 0
        try:
            lastAccessTime = os.path.getmtime(logFileLocation)
            lastAccessTime = datetime.fromtimestamp(lastAccessTime)
        except OSError as e:
            return S_ERROR("Failed to access logfile %s: %r" %
                           (logFileLocation, e))

        now = datetime.now()
        age = now - lastAccessTime
        return S_OK(age)

    def restartInstance(self, pid, instanceName, enabled):
        """Kill a process which is then restarted automatically."""
        if not (self.enabled and enabled):
            self.log.info(
                "Restarting is disabled, please restart %s manually" %
                instanceName)
            self.accounting[instanceName][
                "Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        if any(pattern in instanceName
               for pattern in self.doNotRestartInstancePattern):
            self.log.info(
                "Restarting for %s is disabled, please restart it manually" %
                instanceName)
            self.accounting[instanceName][
                "Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            componentProc = psutil.Process(int(pid))
            processesToTerminate = componentProc.children(recursive=True)
            processesToTerminate.append(componentProc)

            for proc in processesToTerminate:
                proc.terminate()

            _gone, alive = psutil.wait_procs(processesToTerminate,
                                             timeout=5,
                                             callback=partial(
                                                 self.on_terminate,
                                                 instanceName))
            for proc in alive:
                self.log.info("Forcefully killing process %s" % proc.pid)
                proc.kill()

            return S_OK()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes",
                          "%s" % err)
            return S_ERROR()

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond."""
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if not pingRes["OK"]:
            self.log.info("Failure pinging service: %s: %s" %
                          (url, pingRes["Message"]))
            res = self.restartInstance(int(options["PID"]), serviceName,
                                       self.restartServices)
            if not res["OK"]:
                return res
            if res["Value"] != NO_RESTART:
                self.accounting[serviceName][
                    "Treatment"] = "Successfully Restarted"
                self.log.info("Service %s has been successfully restarted" %
                              serviceName)
        self.log.info("Service responded OK")
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of agent's log file, if it is too old then restart the agent."""
        pollingTime, currentLogLocation, pid = (options["PollingTime"],
                                                options["LogFileLocation"],
                                                options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        age = res["Value"]
        self.log.info("Current log file for %s is %d minutes old" %
                      (agentName, (age.seconds / MINUTES)))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if age.seconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = age.seconds / MINUTES

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" %
                          agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        age = res["Value"]
        self.log.info("Current log file for %s is %d minutes old" %
                      (executor, (age.seconds / MINUTES)))

        if age.seconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = age.seconds / MINUTES

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["OK"] and res["Value"] == NO_CHECKING_JOBS:
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        elif res["OK"] and res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" %
                          executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Check if there are checking jobs with the **executorName** as current MinorStatus."""
        attrDict = {"Status": "Checking", "MinorStatus": executorName}

        # returns list of jobs IDs
        resJobs = self.jobMonClient.getJobs(attrDict)
        if not resJobs["OK"]:
            self.logError("Could not get jobs for this executor",
                          "%s: %s" % (executorName, resJobs["Message"]))
            return resJobs
        if resJobs["Value"]:
            self.log.info('Found %d jobs in "Checking" status for %s' %
                          (len(resJobs["Value"]), executorName))
            return S_OK(CHECKING_JOBS)
        self.log.info('Found no jobs in "Checking" status for %s' %
                      executorName)
        return S_OK(NO_CHECKING_JOBS)

    def componentControl(self):
        """Monitor and control component status as defined in the CS.

        Check for running and stopped components and ensure they have the proper status as defined in the CS
        Registry/Hosts/_HOST_/[Running|Stopped] sections

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # get the current status of the components

        resCurrent = self._getCurrentComponentStatus()
        if not resCurrent["OK"]:
            return resCurrent
        currentStatus = resCurrent["Value"]

        resDefault = self._getDefaultComponentStatus()
        if not resDefault["OK"]:
            return resDefault
        defaultStatus = resDefault["Value"]

        # ensure instances are in the right state
        shouldBe = {}
        shouldBe["Run"] = defaultStatus["Run"].intersection(
            currentStatus["Down"])
        shouldBe["Down"] = defaultStatus["Down"].intersection(
            currentStatus["Run"])
        shouldBe["Unknown"] = defaultStatus["All"].symmetric_difference(
            currentStatus["All"])

        self._ensureComponentRunning(shouldBe["Run"])
        self._ensureComponentDown(shouldBe["Down"])

        for instance in shouldBe["Unknown"]:
            self.logError("Unknown instance",
                          "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Get current status for components."""
        resOverall = self.sysAdminClient.getOverallStatus()
        if not resOverall["OK"]:
            return resOverall
        currentStatus = {"Down": set(), "Run": set(), "All": set()}
        informationDict = resOverall["Value"]
        for systemsDict in informationDict.values():
            for system, instancesDict in systemsDict.items():
                for instanceName, instanceInfoDict in instancesDict.items():
                    identifier = "%s__%s" % (system, instanceName)
                    runitStatus = instanceInfoDict.get("RunitStatus")
                    if runitStatus in ("Run", "Down"):
                        currentStatus[runitStatus].add(identifier)

        currentStatus["All"] = currentStatus["Run"] | currentStatus["Down"]
        return S_OK(currentStatus)

    def _getDefaultComponentStatus(self):
        """Get the configured status of the components."""
        host = socket.getfqdn()
        defaultStatus = {"Down": set(), "Run": set(), "All": set()}
        resRunning = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", host, "Running"))
        resStopped = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", host, "Stopped"))
        if not resRunning["OK"]:
            return resRunning
        if not resStopped["OK"]:
            return resStopped
        defaultStatus["Run"] = set(resRunning["Value"])
        defaultStatus["Down"] = set(resStopped["Value"])
        defaultStatus["All"] = defaultStatus["Run"] | defaultStatus["Down"]

        if defaultStatus["Run"].intersection(defaultStatus["Down"]):
            self.logError(
                "Overlap in configuration",
                str(defaultStatus["Run"].intersection(defaultStatus["Down"])))
            return S_ERROR("Bad host configuration")

        return S_OK(defaultStatus)

    def _ensureComponentRunning(self, shouldBeRunning):
        """Ensure the correct components are running."""
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.startComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to start component:",
                                  "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance][
                        "Treatment"] = "Instance was down, started instance"
            else:
                self.accounting[instance][
                    "Treatment"] = "Instance is down, should be started"

    def _ensureComponentDown(self, shouldBeDown):
        """Ensure the correct components are not running."""
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.stopComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to stop component:",
                                  "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance][
                        "Treatment"] = "Instance was running, stopped instance"
            else:
                self.accounting[instance][
                    "Treatment"] = "Instance is running, should be stopped"

    def checkURLs(self):
        """Ensure that the running services have their URL in the Config."""
        self.log.info("Checking URLs")
        # get services again, in case they were started/stop in controlComponents
        gConfig.forceRefresh(fromMaster=True)

        # get port used for https based services
        try:
            tornadoSystemInstance = PathFinder.getSystemInstance(
                system="Tornado",
                setup=self.setup,
            )
            self._tornadoPort = gConfig.getValue(
                Path.cfgPath("/System/Tornado/", tornadoSystemInstance,
                             "Port"),
                self._tornadoPort,
            )
        except RuntimeError:
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        res = self.getRunningInstances(instanceType="Services",
                                       runitStatus="All")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]
        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" %
                           (service, options))
            # ignore SystemAdministrator, does not have URLs
            if "SystemAdministrator" in service:
                continue
            self._checkServiceURL(service, options)

        if self.csAPI.csModified and self.commitURLs:
            self.log.info("Commiting changes to the CS")
            result = self.csAPI.commit()
            if not result["OK"]:
                self.logError("Commit to CS failed", result["Message"])
                return S_ERROR("Failed to commit to CS")
        return S_OK()

    def _checkServiceURL(self, serviceName, options):
        """Ensure service URL is properly configured in the CS."""
        url = self._getURL(serviceName, options)
        system = options["System"]
        module = options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))
        urlsConfigPath = Path.cfgPath(
            PathFinder.getSystemURLSection(system=system, setup=self.setup),
            module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)
        runitStatus = options["RunitStatus"]
        wouldHave = "Would have " if not self.commitURLs else ""
        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url,
                                                            system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
        if runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url,
                                                                system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Return URL for the service."""
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        host = socket.getfqdn()
        protocol = options.get("Protocol", "dips")
        url = "%s://%s:%s/%s/%s" % (protocol, host, port, system, serviceName)
        return url
Exemplo n.º 10
0
class WorkflowTasks(TaskBase):
    """Handles jobs"""
    def __init__(
        self,
        transClient=None,
        logger=None,
        submissionClient=None,
        jobMonitoringClient=None,
        outputDataModule=None,
        jobClass=None,
        opsH=None,
        destinationPlugin=None,
        ownerDN=None,
        ownerGroup=None,
    ):
        """Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
        """

        if not logger:
            logger = gLogger.getSubLogger(self.__class__.__name__)

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = True if (bool(ownerDN)
                                   and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                "Transformations/DestinationPlugin", "BySE")
        else:
            self.destinationPlugin = destinationPlugin

        self.destinationPlugin_o = None

        self.outputDataModule_o = None

    def prepareTransformationTasks(self,
                                   transBody,
                                   taskDict,
                                   owner="",
                                   ownerGroup="",
                                   ownerDN="",
                                   bulkSubmissionFlag=False):
        """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.


        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """

        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res["OK"]:
                return res
            proxyInfo = res["Value"]
            owner = proxyInfo["username"]
            ownerGroup = proxyInfo["group"]

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res["OK"]:
                return res
            ownerDN = res["Value"][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner,
                                           ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup,
                                   ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup,
                           ownerDN):
        """Prepare transformation tasks with a single job object for bulk submission

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasksBulk"
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose("Setting job owner:group to %s:%s" %
                         (owner, ownerGroup),
                         transID=transID,
                         method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        try:
            site = oJob.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJob.workflow.findParameter("JobType").getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter("JOB_ID"):
            oJob._addParameter(oJob.workflow, "JOB_ID", "string", "00000000",
                               "Initial JOB_ID")

        if oJob.workflow.findParameter("PRODUCTION_ID"):
            oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                "PRODUCTION_ID",
                "string",
                str(transID).zfill(8),
                "Production ID",
            )
        oJob.setType(jobType)
        self._logVerbose("Adding default transformation group of %s" %
                         (transGroup),
                         transID=transID,
                         method=method)
        oJob.setJobGroup(transGroup)

        clinicPath = self._checkSickTransformations(transID)
        if clinicPath:
            self._handleHospital(oJob, clinicPath)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites",
                               transID=transID,
                               method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose("Setting Site: ",
                                 str(sites),
                                 transID=transID,
                                 method=method)
                seqDict["Site"] = sites

            seqDict["JobName"] = self._transTaskName(transID, taskID)
            seqDict["JOB_ID"] = str(taskID).zfill(8)

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" %
                (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # Handle Input Data
            inputData = paramsDict.get("InputData")
            if inputData:
                if isinstance(inputData, six.string_types):
                    inputData = inputData.replace(" ", "").split(";")
                self._logVerbose("Setting input data to %s" % inputData,
                                 transID=transID,
                                 method=method)
                seqDict["InputData"] = inputData
            elif paramSeqDict.get("InputData") is not None:
                self._logError(
                    "Invalid mixture of jobs with and without input data")
                return S_ERROR(
                    ETSDATA,
                    "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.items():
                if paramName not in ("InputData", "Site", "TargetSE"):
                    if paramValue:
                        self._logVerbose("Setting %s to %s" %
                                         (paramName, paramValue),
                                         transID=transID,
                                         method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({
                    "Job": oJob._toXML(),  # pylint: disable=protected-access
                    "TransformationID": transID,
                    "TaskID": taskID,
                    "InputData": inputData,
                })
                if not res["OK"]:
                    self._logError("Failed to generate output data",
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res["Value"].items():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow,
                            name,
                            "JDL",
                            "%%(%s)s" % name,
                            name  # pylint: disable=protected-access
                        )

            for pName, seq in seqDict.items():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.items():
            if paramName in ["JOB_ID", "PRODUCTION_ID", "InputData"
                             ] + outputParameterList:
                res = oJob.setParameterSequence(paramName,
                                                paramSeq,
                                                addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res["OK"]:
                return res

        if taskDict:
            self._logInfo("Prepared %d tasks" % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)

        taskDict["BulkJobObject"] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a job object per task

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return:  S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasks"
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJobTemplate.workflow.findParameter("JobType").getValue()
        templateOK = False
        getOutputDataTiming = 0.0

        for taskID, paramsDict in taskDict.items():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose("Job owner:group to %s:%s" %
                                 (owner, ownerGroup),
                                 transID=transID,
                                 method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose("Adding default transformation group of %s" %
                                 (transGroup),
                                 transID=transID,
                                 method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter("PRODUCTION_ID"):
                    oJobTemplate._setParamValue("PRODUCTION_ID",
                                                str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               "PRODUCTION_ID", "string",
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter("JOB_ID"):
                    oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID",
                                               "string", "00000000",
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose("Setting task name to %s" % constructedName,
                             transID=transID,
                             method=method)
            oJob.setName(constructedName)
            oJob._setParamValue("JOB_ID", str(taskID).zfill(8))
            inputData = None

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" %
                (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites",
                               transID=transID,
                               method=method)
                paramsDict["TaskObject"] = ""
                continue
            else:
                self._logDebug("Setting Site: ",
                               str(sites),
                               transID=transID,
                               method=method)
                res = oJob.setDestination(sites)
                if not res["OK"]:
                    self._logError("Could not set the site: %s" %
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    paramsDict["TaskObject"] = ""
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            clinicPath = self._checkSickTransformations(transID)
            if clinicPath:
                self._handleHospital(oJob, clinicPath)

            paramsDict["TaskObject"] = ""
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({
                    "Job": oJob._toXML(),
                    "TransformationID": transID,
                    "TaskID": taskID,
                    "InputData": inputData
                })
                getOutputDataTiming += time.time()
                if not res["OK"]:
                    self._logError("Failed to generate output data",
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res["Value"].items():
                    oJob._addJDLParameter(name, ";".join(output))
            paramsDict["TaskObject"] = oJob
        if taskDict:
            self._logVerbose(
                "Average getOutputData time: %.1f per task" %
                (getOutputDataTiming / len(taskDict)),
                transID=transID,
                method=method,
            )
            self._logInfo("Prepared %d tasks" % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """Handle Sites and TargetSE in the parameters"""

        try:
            sites = ["ANY"]
            if paramsDict["Site"]:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict["Site"], sepChar=";")
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res["OK"]:
                self._logFatal(
                    "Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res["Value"]
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ["ANY"]:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """set job inputs (+ metadata)"""
        inputData = paramsDict.get("InputData")
        transID = paramsDict["TransformationID"]
        if inputData:
            self._logVerbose("Setting input data to %s" % inputData,
                             transID=transID,
                             method="_handleInputs")
            res = oJob.setInputData(inputData)
            if not res["OK"]:
                self._logError("Could not set the inputs: %s" % res["Message"],
                               transID=transID,
                               method="_handleInputs")

    def _handleRest(self, oJob, paramsDict):
        """add as JDL parameters all the other parameters that are not for inputs or destination"""
        transID = paramsDict["TransformationID"]
        for paramName, paramValue in paramsDict.items():
            if paramName not in ("InputData", "Site", "TargetSE"):
                if paramValue:
                    self._logDebug("Setting %s to %s" %
                                   (paramName, paramValue),
                                   transID=transID,
                                   method="_handleRest")
                    oJob._addJDLParameter(paramName, paramValue)

    def _checkSickTransformations(self, transID):
        """Check if the transformation is in the transformations to be processed at Hospital or Clinic"""
        transID = int(transID)
        clinicPath = "Hospital"
        if transID in set(
                int(x) for x in self.opsH.getValue(
                    os.path.join(clinicPath, "Transformations"), [])):
            return clinicPath
        if "Clinics" in self.opsH.getSections("Hospital").get("Value", []):
            basePath = os.path.join("Hospital", "Clinics")
            clinics = self.opsH.getSections(basePath)["Value"]
            for clinic in clinics:
                clinicPath = os.path.join(basePath, clinic)
                if transID in set(
                        int(x) for x in self.opsH.getValue(
                            os.path.join(clinicPath, "Transformations"), [])):
                    return clinicPath
        return None

    def _handleHospital(self, oJob, clinicPath):
        """Optional handle of hospital/clinic jobs"""
        if not clinicPath:
            return
        oJob.setInputDataPolicy("download", dataScheduling=False)

        # Check first for a clinic, if not it must be the general hospital
        hospitalSite = self.opsH.getValue(
            os.path.join(clinicPath, "ClinicSite"), "")
        hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"),
                                         [])
        # If not found, get the hospital parameters
        if not hospitalSite:
            hospitalSite = self.opsH.getValue("Hospital/HospitalSite",
                                              "DIRAC.JobDebugger.ch")
        if not hospitalCEs:
            hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])

        oJob.setDestination(hospitalSite)
        if hospitalCEs:
            oJob._addJDLParameter("GridCE", hospitalCEs)

    def __generatePluginObject(self, plugin):
        """This simply instantiates the TaskManagerPlugin class with the relevant plugin name"""
        method = "__generatePluginObject"
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ["TaskManagerPlugin"])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" %
                               (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule,
                               "TaskManagerPlugin")("%s" % plugin,
                                                    operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e),
                               method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """Get the list of job output LFNs from the provided plugin"""
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule,
                                                     None)
            if not moduleInstance["OK"]:
                return moduleInstance
            self.outputDataModule_o = moduleInstance["Value"]
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """Submit the tasks"""
        if "BulkJobObject" in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """Submit jobs in one go with one parametric job"""
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = "__submitTransformationTasksBulk"

        oJob = taskDict.pop("BulkJobObject")
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = list(taskDict.values())[0]["TransformationID"]
        if oJob is None:
            self._logError("no bulk Job object found",
                           transID=transID,
                           method=method)
            return S_ERROR(ETSUKN,
                           "No bulk job object provided for submission")

        result = self.submitTaskToExternal(oJob)
        if not result["OK"]:
            self._logError("Failed to submit tasks to external",
                           transID=transID,
                           method=method)
            return result

        jobIDList = result["Value"]
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task["Success"] = False
            return S_ERROR(
                ETSUKN, "Submitted less number of jobs than requested tasks")
        # Get back correspondence with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]["ExternalID"] = jobID
            taskDict[taskID]["Success"] = True

        submitted = len(jobIDList)
        self._logInfo(
            "Submitted %d tasks to WMS in %.1f seconds" %
            (submitted, time.time() - startTime),
            transID=transID,
            method=method,
        )
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """Submit jobs one by one"""
        method = "__submitTransformationTasks"
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.values():
            transID = task["TransformationID"]
            if not task["TaskObject"]:
                task["Success"] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task["TaskObject"])
            if res["OK"]:
                task["ExternalID"] = res["Value"]
                task["Success"] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS",
                               res["Message"],
                               transID=transID,
                               method=method)
                task["Success"] = False
                failed += 1
        if submitted:
            self._logInfo(
                "Submitted %d tasks to WMS in %.1f seconds" %
                (submitted, time.time() - startTime),
                transID=transID,
                method=method,
            )
        if failed:
            self._logError("Failed to submit %d tasks to WMS." % (failed),
                           transID=transID,
                           method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """Submits a single job (which can be a bulk one) to the WMS."""
        if isinstance(job, six.string_types):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", "", x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        transID = None
        jobNames = [
            self._transTaskName(taskDict["TransformationID"],
                                taskDict["TaskID"]) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({"JobName": jobNames})
        if not res["OK"]:
            self._logError(
                "Failed to get task from WMS",
                res["Message"],
                transID=transID,
                method="updateTransformationReservedTasks",
            )
            return res
        jobNameIDs = {}
        for wmsID in res["Value"]:
            res = self.jobMonitoringClient.getJobSummary(int(wmsID))
            if not res["OK"]:
                self._logWarn(
                    "Failed to get task summary from WMS",
                    res["Message"],
                    transID=transID,
                    method="updateTransformationReservedTasks",
                )
            else:
                jobNameIDs[res["Value"]["JobName"]] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status
        """
        method = "getSubmittedTaskStatus"

        if taskDicts:
            wmsIDs = [
                int(taskDict["ExternalID"]) for taskDict in taskDicts
                if int(taskDict["ExternalID"])
            ]
            transID = taskDicts[0]["TransformationID"]
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res["Value"]
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict["TaskID"]
            wmsID = int(taskDict["ExternalID"])
            if not wmsID:
                continue
            oldStatus = taskDict["ExternalStatus"]
            newStatus = statusDict.get(wmsID, {}).get("Status", "Removed")
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        "Production/Job %d/%d removed from WMS while it is in %s status"
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method,
                    )
                    newStatus = "Failed"
                self._logVerbose(
                    "Setting job status for Production/Job %d/%d to %s" %
                    (transID, taskID, newStatus),
                    transID=transID,
                    method=method,
                )
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN
        """
        if not fileDicts:
            return S_OK({})

        method = "getSubmittedFileStatus"

        # All files are from the same transformation
        transID = fileDicts[0]["TransformationID"]
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict["TaskID"])
            taskFiles.setdefault(jobName,
                                 {})[fileDict["LFN"]] = fileDict["Status"]

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res["OK"]:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID,
                          method=method)
            return res
        noTasks = res["Value"]["NoTasks"]
        taskNameIDs = res["Value"]["TaskNameIDs"]

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].items():
                if oldStatus != TransformationFilesStatus.UNUSED:
                    updateDict[lfn] = TransformationFilesStatus.UNUSED

        res = self.jobMonitoringClient.getJobsStatus(list(
            taskNameIDs.values()))
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res["Value"]
        for jobName, wmsID in taskNameIDs.items():
            jobStatus = statusDict.get(wmsID, {}).get("Status")
            newFileStatus = {
                "Done": TransformationFilesStatus.PROCESSED,
                "Completed": TransformationFilesStatus.PROCESSED,
                "Failed": TransformationFilesStatus.UNUSED,
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].items():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
Exemplo n.º 11
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """# Now, let's submit some jobs. Different sites, types, inputs"""
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [["/a/1.txt", "/a/2.txt"],
                 ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
        types = ["User", "Test"]
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination("DIRAC.Jenkins.ch")
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res["OK"], res.get("Message"))
                jobID = res["Value"]
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set(res["Value"]) <= {"ANY", "DIRAC.Jenkins.ch", "Site"},
            msg="Got %s" % res["Value"])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(sorted(res["Value"]),
                         sorted(types),
                         msg="Got %s" % str(sorted(res["Value"])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], ["app status", "Unknown"],
                         msg="Got %s" % str(res["Value"]))

        res = jobMonitor.getOwners()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_empty = res["Value"]
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanNow = res["Value"]
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanOneYear = res["Value"]
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)),
            resJG_olderThanOneYear)
        res = jobMonitor.getStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"])
            in [[JobStatus.RECEIVED],
                sorted([JobStatus.RECEIVED, JobStatus.KILLED])], res["Value"])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"]) in [
                ["Job accepted"],
                sorted(["Job accepted", "Job Rescheduled"]),
                sorted(["Job accepted", "Marked for termination"]),
            ],
            res["Value"],
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobs()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set([str(x) for x in jobIDs]) <= set(res["Value"]), res["Value"])
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow()): {
                    "Status": JobStatus.CHECKING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                }
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.CHECKING)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus")

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)):
                {
                    "Status": JobStatus.WAITING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                },
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)):
                {
                    "Status": JobStatus.MATCHED,
                    "MinorStatus": "MinorStatus-matched",
                    "Source": "Unknown",
                },
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]})
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobAttribute(jobID, "Status",
                                             JobStatus.RUNNING)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING)

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)