Пример #1
0
    def initialize(self):
        """Agent initialisation.

        Reads and applies the agent configuration options and creates the
        client objects used during execution.

        :param self: self reference
        """
        # Shifter proxy. See the cleanContent method: this proxy is used ALSO
        # when the file catalog in use is the DIRAC File Catalog (DFC); this
        # works because the "UseServerCertificate" option gets unset there.
        self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

        # Transformation types to consider (agent option wins over Operations)
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTypes = self.am_getOption("TransformationTypes", [])
        self.transformationTypes = sorted(
            agentTypes or self.dataProcTTypes + self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))

        # Locations to search for transformation directories
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))

        # Metadata tag carrying the transformation ID
        self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)

        # Archive period, in days
        self.archiveAfter = self.am_getOption("ArchiveAfter",
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)

        # Storage element holding transformation logs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # Clients used by the agent
        self.transClient = TransformationClient()
        self.wmsClient = WMSClient()
        self.reqClient = ReqClient()
        self.metadataClient = FileCatalogClient()
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()
Пример #2
0
 def initialize(self):
     """Initialise the agent: set the polling time and create the clients."""
     self.am_setOption("PollingTime", 60)
     self.jobmon = JobMonitoringClient()
     self.ovc = OverlaySystemClient()
     return S_OK()
Пример #3
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None,
                 destinationPlugin=None,
                 ownerDN=None,
                 ownerGroup=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present

        :param transClient: transformation client (forwarded to the base class)
        :param logger: logger object; a 'WorkflowTasks' sub-logger is created if None
        :param submissionClient: WMS client; built here (possibly with delegation) if None
        :param jobMonitoringClient: job monitoring client; default-constructed if None
        :param outputDataModule: output data module name; read from Operations if None
        :param jobClass: job class to instantiate; DIRAC Job if None
        :param opsH: Operations helper; default-constructed if None
        :param destinationPlugin: destination plugin name; read from Operations ('BySE') if None
        :param ownerDN: DN to delegate job submission to (optional)
        :param ownerGroup: group to delegate job submission to (optional)
        """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Certificates are only useful when both delegation parameters are set.
        # (Was: "True if (bool(ownerDN) and bool(ownerGroup)) else False".)
        useCertificates = bool(ownerDN and ownerGroup)

        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        # For each remaining dependency, fall back to a freshly-created default
        # when the caller did not supply an instance.
        self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
        self.jobClass = jobClass or Job
        self.opsH = opsH or Operations()
        self.outputDataModule = outputDataModule or self.opsH.getValue(
            "Transformations/OutputDataModule", "")
        self.destinationPlugin = destinationPlugin or self.opsH.getValue(
            'Transformations/DestinationPlugin', 'BySE')

        # Instantiated lazily later on
        self.destinationPlugin_o = None
        self.outputDataModule_o = None
Пример #4
0
  def deleteJobOversizedSandbox(self, jobIDList):
    """ Delete the job oversized sandbox files from storage elements

    :param list jobIDList: list of job IDs
    :returns: S_OK({'Successful': ..., 'Failed': ...}) or the failed client result
    """

    failed = {}
    successful = {}

    result = JobMonitoringClient().getJobParameters(jobIDList, 'OutputSandboxLFN')
    if not result['OK']:
      return result
    osLFNList = result['Value']
    if not osLFNList:
      return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now.
    # FIX: dict.iteritems() does not exist on Python 3 — use items() instead.
    for jobID, outputSandboxLFNdict in osLFNList.items():
      lfn = outputSandboxLFNdict['OutputSandboxLFN']
      result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful': successful, 'Failed': failed}
    return S_OK(result)
Пример #5
0
def checkJobStateTransition(jobID,
                            candidateState,
                            currentStatus=None,
                            jobMonitoringClient=None):
    """Check whether moving *jobID* to *candidateState* is an allowed transition.

    When *currentStatus* is not supplied it is fetched via the job monitoring
    client (a default one is created if none was passed in).
    """
    if not currentStatus:
        if not jobMonitoringClient:
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

            jobMonitoringClient = JobMonitoringClient()

        statusRes = jobMonitoringClient.getJobsStatus(jobID)
        if not statusRes["OK"]:
            return statusRes
        try:
            currentStatus = statusRes["Value"][jobID]["Status"]
        except KeyError:
            return S_ERROR("Job does not exist")

    nextRes = JobsStateMachine(currentStatus).getNextState(candidateState)
    if not nextRes["OK"]:
        return nextRes

    # The state machine refused the candidate state: log and reject
    if candidateState != nextRes["Value"]:
        gLogger.error(
            "Job Status Error",
            "%s can't move from %s to %s" %
            (jobID, currentStatus, candidateState),
        )
        return S_ERROR("Job state transition not allowed")
    return S_OK()
Пример #6
0
  def test_ParametricChain(self):
    """ This test will submit a parametric job which should generate 3 actual jobs
    """
    wms = WMSClient()
    stateUpdate = JobStateUpdateClient()
    monitor = JobMonitoringClient()

    # build the parametric job and its JDL description file
    job = parametricJob()
    jobDescription = createFile(job)

    # submit it: three concrete jobs should come back
    res = wms.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'])
    jobIDList = res['Value']
    self.assertEqual(len(jobIDList), 3)

    # the generated jobs carry parametrised names
    res = monitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(res['OK'])
    jobNames = [res['Value'][jobID]['JobName'] for jobID in res['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    # mark every job Done, then delete the whole batch
    for jobID in jobIDList:
      res = stateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
      self.assertTrue(res['OK'])

    res = wms.deleteJob(jobIDList)
    self.assertTrue(res['OK'])

    # all jobs must now report the Deleted status
    for jobID in jobIDList:
      res = monitor.getJobStatus(jobID)
      self.assertTrue(res['OK'])
      self.assertEqual(res['Value'], 'Deleted')
Пример #7
0
    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)

        # configuration defaults (overridden later from agent options)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        # clients talking to this host's services
        fqdn = socket.getfqdn()
        self.sysAdminClient = SystemAdministratorClient(fqdn)
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None

        # bookkeeping containers
        self.agents = {}
        self.executors = {}
        self.services = {}
        self._tornadoPort = "8443"
        self.errors = []
        self.accounting = defaultdict(dict)

        # notification email settings
        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % fqdn
Пример #8
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None):
        """Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
        """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # Each default dependency is imported lazily, only when the caller
        # did not hand in a ready-made instance.
        if submissionClient:
            self.submissionClient = submissionClient
        else:
            from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
            self.submissionClient = WMSClient()

        if jobMonitoringClient:
            self.jobMonitoringClient = jobMonitoringClient
        else:
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
            self.jobMonitoringClient = JobMonitoringClient()

        if outputDataModule:
            self.outputDataModule = outputDataModule
        else:
            self.outputDataModule = gConfig.getValue(
                "/DIRAC/VOPolicy/OutputDataModule", "")

        if jobClass:
            self.jobClass = jobClass
        else:
            from DIRAC.Interfaces.API.Job import Job
            self.jobClass = Job

        if opsH:
            self.opsH = opsH
        else:
            from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
            self.opsH = Operations()
Пример #9
0
    def deleteJobOversizedSandbox(self, jobIDList):
        """ Delete the job oversized sandbox files from storage elements
        """
        failed = {}
        successful = {}

        # Collect, per job, the oversized-sandbox LFN (if there is one)
        lfnDict = {}
        for jobID in jobIDList:
            result = JobMonitoringClient().getJobParameter(
                jobID, 'OutputSandboxLFN')
            if not result['OK']:
                gLogger.error('Error interrogating JobDB: %s' %
                              result['Message'])
                continue
            lfn = result['Value'].get('OutputSandboxLFN')
            if lfn:
                lfnDict[lfn] = jobID
            else:
                successful[jobID] = 'No oversized sandbox found'

        if not lfnDict:
            return S_OK({'Successful': successful, 'Failed': failed})

        # Schedule removal of the LFNs now
        for lfn, jobID in lfnDict.items():
            attrRes = self.jobDB.getJobAttributes(jobID,
                                                  ['OwnerDN', 'OwnerGroup'])
            if not attrRes['OK'] or not attrRes['Value']:
                failed[jobID] = lfn
                continue

            removal = self.__setRemovalRequest(lfn,
                                               attrRes['Value']['OwnerDN'],
                                               attrRes['Value']['OwnerGroup'])
            if removal['OK']:
                successful[jobID] = lfn
            else:
                failed[jobID] = lfn

        return S_OK({'Successful': successful, 'Failed': failed})
Пример #10
0
    def deleteJobOversizedSandbox(self, jobIDList):
        """
        Deletes the job oversized sandbox files from storage elements.
        Creates a request in RMS if not immediately possible.

        :param list jobIDList: list of job IDs
        :returns: S_OK/S_ERROR
        """
        failed = {}
        successful = {}

        result = JobMonitoringClient().getJobParameters(
            jobIDList, ["OutputSandboxLFN"])
        if not result["OK"]:
            return result
        # keep only jobs that actually have a recorded oversized-sandbox LFN
        osLFNDict = {jid: params
                     for jid, params in result["Value"].items() if params}
        if not osLFNDict:
            return S_OK({"Successful": successful, "Failed": failed})

        self.log.verbose("Deleting oversized sandboxes", osLFNDict)
        # Schedule removal of the LFNs now
        for jobID, paramDict in osLFNDict.items():
            lfn = paramDict["OutputSandboxLFN"]
            attrRes = self.jobDB.getJobAttributes(jobID,
                                                  ["OwnerDN", "OwnerGroup"])
            if not attrRes["OK"] or not attrRes["Value"]:
                failed[jobID] = lfn
                continue

            ownerDN = attrRes["Value"]["OwnerDN"]
            ownerGroup = attrRes["Value"]["OwnerGroup"]
            removal = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if removal["OK"]:
                successful[jobID] = lfn
            else:
                failed[jobID] = lfn

        return S_OK({"Successful": successful, "Failed": failed})
Пример #11
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None):
        """Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
        """
        logger = logger or gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        # fall back to freshly-created defaults for anything not supplied
        self.submissionClient = submissionClient or WMSClient()
        self.jobMonitoringClient = jobMonitoringClient or JobMonitoringClient()
        self.jobClass = jobClass or Job
        self.opsH = opsH or Operations()
        self.outputDataModule = outputDataModule or self.opsH.getValue(
            "Transformations/OutputDataModule", "")
Пример #12
0
  def __getJobPilotStatus(self, jobID):
    """ Get the job pilot status
    """
    paramRes = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramRes['OK']:
      return paramRes
    pilotReference = paramRes['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
      # No pilot reference recorded for this job, so its status is unknown
      return S_OK('NoPilot')

    infoRes = PilotManagerClient().getPilotInfo(pilotReference)
    if infoRes['OK']:
      return S_OK(infoRes['Value'][pilotReference]['Status'])

    # "no pilot" is a benign condition; everything else is a real failure
    if DErrno.cmpError(infoRes, DErrno.EWMSNOPILOT):
      self.log.warn("No pilot found", "for job %d: %s" % (jobID, infoRes['Message']))
      return S_OK('NoPilot')
    self.log.error('Failed to get pilot information',
                   'for job %d: %s' % (jobID, infoRes['Message']))
    return infoRes
Пример #13
0
    def test_ParametricChain(self):
        """This test will submit a parametric job which should generate 3 actual jobs"""
        wmsClient = WMSClient()
        jobStateUpdate = JobStateUpdateClient()
        jobMonitor = JobMonitoringClient()

        # create the job
        job = parametricJob()
        jobDescription = createFile(job)

        # submit the job: three concrete jobs should be generated
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        jobIDList = res["Value"]
        self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

        # the generated jobs carry parametrised names
        res = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
        self.assertTrue(res["OK"], res.get("Message"))
        jobNames = [res["Value"][jobID]["JobName"] for jobID in res["Value"]]
        self.assertEqual(
            set(jobNames),
            set(["parametric_helloWorld_%s" % nJob for nJob in range(3)]))

        # move all jobs to CHECKING, then delete the whole batch
        for jobID in jobIDList:
            res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING,
                                              "checking", "source")
            self.assertTrue(res["OK"], res.get("Message"))

        res = wmsClient.deleteJob(jobIDList)
        self.assertTrue(res["OK"], res.get("Message"))
        # FIX: removed a leftover debug print(res) that polluted test output

        # every job must now report the DELETED status
        for jobID in jobIDList:
            res = jobMonitor.getJobsStatus(jobID)
            self.assertTrue(res["OK"], res.get("Message"))
            self.assertEqual(res["Value"][jobID]["Status"],
                             JobStatus.DELETED,
                             msg="Got %s" % str(res["Value"]))
Пример #14
0
    def __getJobPilotStatus(self, jobID):
        """ Get the job pilot status
        """
        paramRes = JobMonitoringClient().getJobParameter(jobID,
                                                         'Pilot_Reference')
        if not paramRes['OK']:
            return paramRes
        pilotReference = paramRes['Value'].get('Pilot_Reference')
        if not pilotReference:
            # no pilot reference recorded for this job -> status unknown
            return S_OK('NoPilot')

        infoRes = PilotManagerClient().getPilotInfo(pilotReference)
        if infoRes['OK']:
            return S_OK(infoRes['Value'][pilotReference]['Status'])

        # "no pilots" is benign; any other failure is reported as an error
        if "No pilots found" in infoRes['Message']:
            self.log.warn(infoRes['Message'])
            return S_OK('NoPilot')
        self.log.error('Failed to get pilot information',
                       'for job %d: ' % jobID + infoRes['Message'])
        return S_ERROR('Failed to get the pilot status')
Пример #15
0
    def _getJobPilotStatus(self, jobID):
        """Get the job pilot status"""
        result = JobMonitoringClient().getJobParameter(jobID,
                                                       "Pilot_Reference")
        if not result["OK"]:
            return result
        reference = result["Value"].get("Pilot_Reference", "Unknown")
        if reference == "Unknown":
            # no pilot reference recorded for this job -> status unknown
            return S_OK("NoPilot")

        result = PilotManagerClient().getPilotInfo(reference)
        if result["OK"]:
            return S_OK(result["Value"][reference]["Status"])

        # "no pilot" is a benign condition; anything else is a real failure
        if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
            self.log.warn("No pilot found",
                          "for job %d: %s" % (jobID, result["Message"]))
            return S_OK("NoPilot")
        self.log.error("Failed to get pilot information",
                       "for job %d: %s" % (jobID, result["Message"]))
        return result
Пример #16
0
    def __call__(self):
        """request processing"""

        self.log.debug("about to execute request")
        if not self.rmsMonitoring:
            gMonitor.addMark("RequestAtt", 1)

        # # setup proxy for request owner
        setupProxy = self.setupProxy()
        if not setupProxy["OK"]:
            userSuspended = "User is currently suspended"
            self.request.Error = setupProxy["Message"]
            # In case the user does not have proxy
            if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
                self.log.error("Error setting proxy. Request set to Failed:",
                               setupProxy["Message"])
                # If user is no longer registered, fail the request
                for operation in self.request:
                    for opFile in operation:
                        opFile.Status = "Failed"
                    operation.Status = "Failed"
            elif userSuspended in setupProxy["Message"]:
                # If user is suspended, wait for a long time
                self.request.delayNextExecution(6 * 60)
                self.request.Error = userSuspended
                self.log.error("Error setting proxy: " + userSuspended,
                               self.request.OwnerDN)
            else:
                self.log.error("Error setting proxy", setupProxy["Message"])
            return S_OK(self.request)
        shifter = setupProxy["Value"]["Shifter"]

        error = None

        while self.request.Status == "Waiting":

            # # get waiting operation
            operation = self.request.getWaiting()
            if not operation["OK"]:
                self.log.error("Cannot get waiting operation",
                               operation["Message"])
                return operation
            operation = operation["Value"]
            self.log.info("executing operation", "%s" % operation.Type)

            # # and handler for it
            handler = self.getHandler(operation)
            if not handler["OK"]:
                self.log.error("Unable to process operation",
                               "%s: %s" % (operation.Type, handler["Message"]))
                # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
                operation.Error = handler["Message"]
                break

            handler = handler["Value"]
            # # set shifters list in the handler
            handler.shifter = shifter
            # set rmsMonitoring flag for the RequestOperation
            handler.rmsMonitoring = self.rmsMonitoring
            # # and execute
            pluginName = self.getPluginName(
                self.handlersDict.get(operation.Type))
            if self.standalone:
                useServerCertificate = gConfig.useServerCertificate()
            else:
                # Always use server certificates if executed within an agent
                useServerCertificate = True
            try:
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Attempted",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
                # Always use request owner proxy
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "false")
                exe = handler()
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                if not exe["OK"]:
                    self.log.error("unable to process operation",
                                   "%s: %s" % (operation.Type, exe["Message"]))
                    if pluginName:
                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord({
                                "timestamp":
                                int(Time.toEpoch()),
                                "host":
                                Network.getFQDN(),
                                "objectType":
                                "Operation",
                                "operationType":
                                pluginName,
                                "objectID":
                                operation.OperationID,
                                "parentID":
                                operation.RequestID,
                                "status":
                                "Failed",
                                "nbObject":
                                1,
                            })
                        else:
                            gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Request",
                            "objectID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("RequestFail", 1)

                    if self.request.JobID:
                        # Check if the job exists
                        monitorServer = JobMonitoringClient(
                            useCertificates=True)
                        res = monitorServer.getJobSummary(
                            int(self.request.JobID))
                        if not res["OK"]:
                            self.log.error(
                                "RequestTask: Failed to get job status",
                                "%d" % self.request.JobID)
                        elif not res["Value"]:
                            self.log.warn(
                                "RequestTask: job does not exist (anymore): failed request",
                                "JobID: %d" % self.request.JobID,
                            )
                            for opFile in operation:
                                opFile.Status = "Failed"
                            if operation.Status != "Failed":
                                operation.Status = "Failed"
                            self.request.Error = "Job no longer exists"
            except Exception as e:
                error = str(e)
                self.log.exception("hit by exception:", "%s" % error)
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Request",
                        "objectID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                break

            # # operation status check
            if operation.Status == "Done" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Successful",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
            elif operation.Status == "Failed" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            elif operation.Status in ("Waiting", "Scheduled"):
                # # no update for waiting or all files scheduled
                break

        if not self.rmsMonitoring:
            gMonitor.flush()

        if error:
            return S_ERROR(error)

        # # request done?
        if self.request.Status == "Done":
            # # update request to the RequestDB
            self.log.info("Updating request status:",
                          "%s" % self.request.Status)
            update = self.updateRequest()
            if not update["OK"]:
                self.log.error("Cannot update request status",
                               update["Message"])
                return update
            self.log.info("request is done", "%s" % self.request.RequestName)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp":
                    int(Time.toEpoch()),
                    "host":
                    Network.getFQDN(),
                    "objectType":
                    "Request",
                    "objectID":
                    getattr(self.request, "RequestID", 0),
                    "status":
                    "Successful",
                    "nbObject":
                    1,
                })
            else:
                gMonitor.addMark("RequestOK", 1)
            # # and there is a job waiting for it? finalize!
            if self.request.JobID:
                attempts = 0
                while True:
                    finalizeRequest = self.requestClient.finalizeRequest(
                        self.request.RequestID,
                        self.request.JobID  # pylint: disable=no-member
                    )
                    if not finalizeRequest["OK"]:
                        if not attempts:
                            self.log.error(
                                "unable to finalize request, will retry",
                                "ReqName %s:%s" % (self.request.RequestName,
                                                   finalizeRequest["Message"]),
                            )
                        self.log.debug("Waiting 10 seconds")
                        attempts += 1
                        if attempts == 10:
                            self.log.error("Giving up finalize request")
                            return S_ERROR("Could not finalize request")

                        time.sleep(10)

                    else:
                        self.log.info(
                            "request is finalized",
                            "ReqName %s %s" % (self.request.RequestName,
                                               (" after %d attempts" %
                                                attempts) if attempts else ""),
                        )
                        break

        # Commit all the data to the ES Backend
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()
        # Request will be updated by the callBack method
        self.log.verbose("RequestTasks exiting",
                         "request %s" % self.request.Status)
        return S_OK(self.request)
Пример #17
0
    def finalizeRequest(self, requestID, jobID, useCertificates=True):
        """Check the request status and, if it is 'Done', finalize the
        corresponding job: refresh the job's pending-request digest and
        promote its status / minor status accordingly.

        Any failure is returned as S_ERROR so the caller can re-try the
        finalization later.

        :param self: self reference
        :param requestID: request id (passed through to getRequestStatus and
            getDigest; the original docstring said str -- NOTE(review): it is
            only ever formatted with %s here, verify the actual type)
        :param jobID: job id; cast with int() for the monitoring calls, but
            also formatted with %d in log messages -- presumably an int,
            TODO confirm callers never pass a string
        :param bool useCertificates: use host certificates for the state
            update and monitoring clients
        :return: S_OK(newJobStatus) on success, S_ERROR otherwise
        """

        stateServer = JobStateUpdateClient(useCertificates=useCertificates)

        # Checking if to update the job status - we should fail here, so it will be re-tried later
        # Checking the state, first
        res = self.getRequestStatus(requestID)
        if not res["OK"]:
            self.log.error(
                "finalizeRequest: failed to get request",
                "request: %s status: %s" % (requestID, res["Message"]))
            return res
        if res["Value"] != "Done":
            # Finalization is only meaningful for a 'Done' request
            return S_ERROR(
                "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                % (requestID, res["Value"]))

        # The request is 'Done', let's update the job status. If we fail, we should re-try later

        monitorServer = JobMonitoringClient(useCertificates=useCertificates)
        res = monitorServer.getJobSummary(int(jobID))
        if not res["OK"]:
            self.log.error("finalizeRequest: Failed to get job status",
                           "JobID: %d" % jobID)
            return res
        elif not res["Value"]:
            # No job record (anymore): nothing left to update, consider it finalized
            self.log.info(
                "finalizeRequest: job %d does not exist (anymore): finalizing"
                % jobID)
            return S_OK()
        else:
            jobStatus = res["Value"]["Status"]
            jobMinorStatus = res["Value"]["MinorStatus"]
            jobAppStatus = ""
            newJobStatus = ""
            if jobStatus == JobStatus.STALLED:
                # If job is stalled, find the previous status from the logging info
                res = monitorServer.getJobLoggingInfo(int(jobID))
                if not res["OK"]:
                    self.log.error(
                        "finalizeRequest: Failed to get job logging info",
                        "JobID: %d" % jobID)
                    return res
                # Check the last status was Stalled and get the one before
                if len(res["Value"]
                       ) >= 2 and res["Value"][-1][0] == JobStatus.STALLED:
                    jobStatus, jobMinorStatus, jobAppStatus = res["Value"][
                        -2][:3]
                    newJobStatus = jobStatus

            # update the job pending request digest in any case since it is modified
            self.log.info(
                "finalizeRequest: Updating request digest for job %d" % jobID)

            digest = self.getDigest(requestID)
            if digest["OK"]:
                digest = digest["Value"]
                self.log.verbose(digest)
                res = stateServer.setJobParameter(jobID, "PendingRequest",
                                                  digest)
                if not res["OK"]:
                    self.log.info(
                        "finalizeRequest: Failed to set job %d parameter: %s" %
                        (jobID, res["Message"]))
                    return res
            else:
                # Failing to get the digest is logged but NOT fatal:
                # the status update below still proceeds
                self.log.error(
                    "finalizeRequest: Failed to get request digest for %s: %s"
                    % (requestID, digest["Message"]))
            if jobStatus == JobStatus.COMPLETED:
                # What to do? Depends on what we have in the minorStatus
                if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
                    newJobStatus = JobStatus.DONE
                elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
                    newJobStatus = JobStatus.FAILED
                elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
                    # If the job has been Killed, set it Killed
                    newJobStatus = JobStatus.KILLED
                else:
                    self.log.error(
                        "finalizeRequest: Unexpected jobMinorStatus",
                        "for %d (got %s)" % (jobID, jobMinorStatus))
                    return S_ERROR("Unexpected jobMinorStatus")

            # An empty newJobStatus means only the minor status is refreshed
            if newJobStatus:
                self.log.info(
                    "finalizeRequest: Updating job status",
                    "for %d to '%s/%s'" %
                    (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
                )
            else:
                self.log.info(
                    "finalizeRequest: Updating job minor status",
                    "for %d to '%s' (current status is %s)" %
                    (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
                )
            stateUpdate = stateServer.setJobStatus(
                jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
            if jobAppStatus and stateUpdate["OK"]:
                stateUpdate = stateServer.setJobApplicationStatus(
                    jobID, jobAppStatus, "RMS")
            if not stateUpdate["OK"]:
                self.log.error(
                    "finalizeRequest: Failed to set job status",
                    "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
                )
                return stateUpdate

        return S_OK(newJobStatus)
Пример #18
0
    # Collect job IDs from the command line: each positional argument may be
    # a single ID or a comma-separated list of IDs.
    jobs = []
    for arg in Script.getPositionalArgs():
        try:
            jobs += [int(job) for job in arg.split(',')]
        except ValueError:
            gLogger.fatal("Invalid list of jobIDs")
            DIRAC.exit(2)

    # Late imports, DIRAC script style: clients are only pulled in after
    # argument parsing has succeeded.
    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient
    from DIRAC.Core.Utilities.SiteSEMapping import getSEsForSite
    dm = DataManager()
    bk = BookkeepingClient()

    monitoring = JobMonitoringClient()

    if not jobs:
        # No explicit job list: select failed jobs that exhausted their
        # reschedulings on input-data resolution.
        # NOTE(review): the trailing space in 'Failed Input Data Resolution '
        # looks deliberate (matching the stored ApplicationStatus) -- verify.
        conditions = {
            'Status': 'Failed',
            'MinorStatus': 'Maximum of reschedulings reached',
            'ApplicationStatus': 'Failed Input Data Resolution '
        }
        prStr = 'all jobs'
        # 'production' and 'userName' are defined earlier in the script,
        # outside this excerpt.
        if production:
            prStr = 'production %s' % ' '.join(production)
            if len(production) == 1:
                production = production[0]
            conditions['JobGroup'] = production
        if userName:
            # presumably a user-based condition is added just past this
            # excerpt -- only the description string is set here
            prStr = 'user %s' % userName
Пример #19
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """Submit a matrix of jobs (destinations x input-data sets x types),
        exercise the JobMonitoring attribute/summary calls against them,
        perform a bulk status update, and finally delete the jobs.

        Fixes applied: ``long`` (Python 2 only -- under Python 3 it raised
        NameError, which the ``except TypeError`` guard did not catch) is
        replaced by plain ``int`` arithmetic, and the deprecated
        ``self.assert_`` alias is replaced by ``self.assertTrue``.
        """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # Submit one job per (destination, input LFNs, type) combination
        jobIDs = []
        dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for dest in dests:
            for lfns in lfnss:
                for jobType in types:
                    job = helloWorldJob()
                    job.setDestination(dest)
                    job.setInputData(lfns)
                    job.setType(jobType)
                    jobDescription = createFile(job)
                    res = wmsClient.submitJob(
                        job._toJDL(xmlFile=jobDescription))
                    self.assertTrue(res['OK'])
                    jobID = res['Value']
                    jobIDs.append(jobID)

        # Distinct attribute values must reflect what was just submitted
        res = jobMonitor.getSites()
        self.assertTrue(res['OK'])
        self.assertTrue(
            set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(types))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

        # These calls are only checked for success, not for content
        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'])
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'])
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'])
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'])
        # Jobs may already have moved from 'Received' to 'Waiting'
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'])
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'])
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(['Job accepted', 'matching'])])
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'])
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'])
        try:
            # A missing counter key makes get() return None and the addition
            # raise TypeError: tolerated, since matching may already have
            # moved jobs to other states.
            self.assertTrue(
                res['Value'].get('Received') + res['Value'].get('Waiting') >=
                len(dests) * len(lfnss) * len(types))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'])
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'])

        # Bulk status update for the last submitted job
        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'])
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'])

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
Пример #20
0
    def __init__(self, *args, **kwargs):
        """Initialize the DataRecoveryAgent: read CS options, create the
        service clients, and build the per-production-type decision table
        (``self.todo``).

        Each entry of the decision table is a dict with:
        Message/ShortMessage (log text), Counter (number of matches),
        Check (predicate on a job wrapper object) and Actions (list of
        corrective calls executed when Check is true).
        """
        AgentModule.__init__(self, *args, **kwargs)
        self.name = 'DataRecoveryAgent'
        # Agent performs no actions unless explicitly enabled
        self.enabled = False

        # Configuration options
        self.productionsToIgnore = self.am_getOption("ProductionsToIgnore", [])
        self.transformationTypes = self.am_getOption("TransformationTypes", [
            'MCReconstruction', 'MCSimulation', 'MCReconstruction_Overlay',
            'MCGeneration'
        ])
        self.transformationStatus = self.am_getOption("TransformationStatus",
                                                      ['Active', 'Completing'])
        # NOTE(review): am_setOption (not am_getOption) -- this forces the
        # option and stores its return value; verify this is intended.
        self.shifterProxy = self.am_setOption('shifterProxy', 'DataManager')

        self.jobStatus = [
            'Failed', 'Done'
        ]  ##This needs to be both otherwise we cannot account for all cases

        # Service clients used when checking/repairing jobs and files
        self.jobMon = JobMonitoringClient()
        self.fcClient = FileCatalogClient()
        self.tClient = TransformationClient()
        self.reqClient = ReqClient()
        self.diracILC = DiracILC()
        # Input LFNs already accounted for by a successful task in this cycle
        self.inputFilesProcessed = set()
        # Decision table: 'MCGeneration' covers productions without input
        # files; 'OtherProductions' covers everything with input files.
        self.todo = {'MCGeneration':
                     [ dict( Message="MCGeneration: OutputExists: Job 'Done'",
                             ShortMessage="MCGeneration: job 'Done' ",
                             Counter=0,
                             Check=lambda job: job.allFilesExist() and job.status=='Failed',
                             Actions=lambda job,tInfo: [ job.setJobDone(tInfo) ]
                           ),
                       dict( Message="MCGeneration: OutputMissing: Job 'Failed'",
                             ShortMessage="MCGeneration: job 'Failed' ",
                             Counter=0,
                             Check=lambda job: job.allFilesMissing() and job.status=='Done',
                             Actions=lambda job,tInfo: [ job.setJobFailed(tInfo) ]
                           ),
                       # dict( Message="MCGeneration, job 'Done': OutputExists: Task 'Done'",
                       #       ShortMessage="MCGeneration: job already 'Done' ",
                       #       Counter=0,
                       #       Check=lambda job: job.allFilesExist() and job.status=='Done',
                       #       Actions=lambda job,tInfo: [ tInfo._TransformationInfo__setTaskStatus(job, 'Done') ]
                       #     ),
                     ],
                     'OtherProductions':
                     [ \
                   ## should always be first!

                       dict( Message="One of many Successful: clean others",
                             ShortMessage="Other Tasks --> Keep",
                             Counter=0,
                             Check=lambda job: job.allFilesExist() and job.otherTasks and job.inputFile not in self.inputFilesProcessed,
                             Actions=lambda job,tInfo: [ self.inputFilesProcessed.add(job.inputFile), job.setJobDone(tInfo), job.setInputProcessed(tInfo) ]
                           ),
                       dict( Message="Other Task processed Input, no Output: Fail",
                             ShortMessage="Other Tasks --> Fail",
                             Counter=0,
                             Check=lambda job: job.inputFile in self.inputFilesProcessed and job.allFilesMissing() and job.status!='Failed',
                             Actions=lambda job,tInfo: [ job.setJobFailed(tInfo) ]
                           ),
                       dict( Message="Other Task processed Input: Fail and clean",
                             ShortMessage="Other Tasks --> Cleanup",
                             Counter=0,
                             Check=lambda job: job.inputFile in self.inputFilesProcessed and not job.allFilesMissing(),
                             Actions=lambda job,tInfo: [ job.setJobFailed(tInfo), job.cleanOutputs(tInfo) ]
                           ),
                       dict( Message="InputFile missing: mark job 'Failed', mark input 'Deleted', clean",
                             ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
                             Counter=0,
                             Check=lambda job: job.inputFile and not job.inputFileExists and job.fileStatus != "Deleted",
                             Actions=lambda job,tInfo: [ job.cleanOutputs(tInfo), job.setJobFailed(tInfo), job.setInputDeleted(tInfo) ]
                           ),
                       dict( Message="InputFile Deleted, output Exists: mark job 'Failed', clean",
                             ShortMessage="Input Deleted --> Job 'Failed, Cleanup",
                             Counter=0,
                             Check=lambda job: job.inputFile and not job.inputFileExists and job.fileStatus == "Deleted" and not job.allFilesMissing(),
                             Actions=lambda job,tInfo: [ job.cleanOutputs(tInfo), job.setJobFailed(tInfo) ]
                           ),
                       ## All Output Exists
                       dict( Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
                             ShortMessage="Output Exists --> Job Done, Input Processed",
                             Counter=0,
                             Check=lambda job: job.allFilesExist() and \
                                               not job.otherTasks and \
                                               job.status=='Failed' and \
                                               job.fileStatus!="Processed" and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [ job.setJobDone(tInfo), job.setInputProcessed(tInfo) ]
                           ),
                       dict( Message="Output Exists, job Failed, input Processed --> Job Done",
                             ShortMessage="Output Exists --> Job Done",
                             Counter=0,
                             Check=lambda job: job.allFilesExist() and \
                                               not job.otherTasks and \
                                               job.status=='Failed' and \
                                               job.fileStatus=="Processed" and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [ job.setJobDone(tInfo) ]
                           ),
                       dict( Message="Output Exists, job Done, input not Processed --> Input Processed",
                             ShortMessage="Output Exists --> Input Processed",
                             Counter=0,
                             Check=lambda job: job.allFilesExist() and \
                                               not job.otherTasks and \
                                               job.status=='Done' and \
                                               job.fileStatus!="Processed" and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [ job.setInputProcessed(tInfo) ]
                           ),
                       ## outputmissing
                       dict( Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
                             ShortMessage="Max ErrorCount --> Input MaxReset",
                             Counter=0,
                             Check=lambda job: job.allFilesMissing() and \
                                               not job.otherTasks and \
                                               job.status=='Failed' and \
                                               job.fileStatus in ASSIGNEDSTATES and \
                                               job.inputFile not in self.inputFilesProcessed and \
                                               job.inputFileExists and \
                                               job.errorCount > MAXRESET,
                             Actions=lambda job,tInfo: [ job.setInputMaxReset(tInfo) ]
                           ),
                       dict( Message="Output Missing, job Failed, input Assigned --> Input Unused",
                             ShortMessage="Output Missing --> Input Unused",
                             Counter=0,
                             Check=lambda job: job.allFilesMissing() and \
                                               not job.otherTasks and \
                                               job.status=='Failed' and \
                                               job.fileStatus in ASSIGNEDSTATES and \
                                               job.inputFile not in self.inputFilesProcessed and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [ job.setInputUnused(tInfo) ]
                           ),
                       dict( Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
                             ShortMessage="Output Missing --> Job Failed, Input Unused",
                             Counter=0,
                             Check=lambda job: job.allFilesMissing() and \
                                               not job.otherTasks and \
                                               job.status=='Done' and \
                                               job.fileStatus in ASSIGNEDSTATES and \
                                               job.inputFile not in self.inputFilesProcessed and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [ job.setInputUnused(tInfo), job.setJobFailed(tInfo) ]
                           ),
                       ## some files missing, needing cleanup. Only checking for
                       ## assigned, because processed could mean an earlier job was
                       ## succesful and this one is just the duplicate that needed
                       ## to be removed! But we check for other tasks earlier, so
                       ## this should not happen
                       dict( Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
                             ShortMessage="Output Missing --> Cleanup, Input Unused",
                             Counter=0,
                             Check=lambda job: job.someFilesMissing() and \
                                               not job.otherTasks and \
                                               job.status=='Failed' and \
                                               job.fileStatus in ASSIGNEDSTATES and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [job.cleanOutputs(tInfo),job.setInputUnused(tInfo)]
                             #Actions=lambda job,tInfo: []
                           ),
                       dict( Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
                             ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused",
                             Counter=0,
                             Check=lambda job: job.someFilesMissing() and \
                                               not job.otherTasks and \
                                               job.status=='Done' and \
                                               job.fileStatus in ASSIGNEDSTATES and \
                                               job.inputFileExists,
                             Actions=lambda job,tInfo: [job.cleanOutputs(tInfo),job.setInputUnused(tInfo),job.setJobFailed(tInfo)]
                             #Actions=lambda job,tInfo: []
                           ),
                       dict( Message="Some missing, job Done --> job Failed",
                             ShortMessage="Output Missing, Done --> Job Failed",
                             Counter=0,
                             Check=lambda job: not job.allFilesExist() and job.status=='Done',
                             Actions=lambda job,tInfo: [job.setJobFailed(tInfo)]
                           ),
                       dict ( Message="Something Strange",
                              ShortMessage="Strange",
                              Counter=0,
                              Check=lambda job: job.status not in ("Failed","Done"),
                              Actions=lambda job,tInfo: []
                            ),
                       ##should always be the last one!
                       dict ( Message="Failed Hard",
                              ShortMessage="Failed Hard",
                              Counter=0,
                              Check=lambda job: False, ## never
                              Actions=lambda job,tInfo: []
                            ),
                     ]
                    }
        # Per-transformation cache of (jobs checked, jobs processed)
        self.jobCache = defaultdict(lambda: (0, 0))
        self.printEveryNJobs = self.am_getOption('PrintEvery', 200)
        ##Notification
        # Accumulated report text and mail addresses (redacted in this copy)
        self.notesToSend = ""
        self.addressTo = self.am_getOption('MailTo',
                                           ["*****@*****.**"])
        self.addressFrom = self.am_getOption('MailFrom',
                                             "*****@*****.**")
        self.subject = "DataRecoveryAgent"
Пример #21
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs

        Fix applied: the original assigned ``jobID = res['JobID']`` and then
        immediately overwrote it with ``jobID = res['Value']``; since the two
        are asserted equal just above, a single assignment suffices.
        """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(isinstance(res['Value'], int),
                        msg="Got %s" % type(res['Value']))
        self.assertEqual(res['Value'],
                         res['JobID'],
                         msg="Got %s, expected %s" %
                         (str(res['Value']), res['JobID']))
        jobID = res['Value']

        # updating the status
        res = jobStateUpdate.setJobStatus(jobID, 'Running',
                                          'Executing Minchiapp', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Received',
                         msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsMinorStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'MinorStatus': 'Job Rescheduled',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsApplicationStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'ApplicationStatus': 'Unknown',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))

        # updating the status again
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Killed',
                         msg="Got %s" % str(res['Value']))

        # updating the status aaaagain
        res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'], 'Done', msg="Got %s" %
            str(res['Value']))  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Deleted',
                         msg="Got %s" % str(res['Value']))
Пример #22
0
    def __init__(self, *args, **kwargs):
        """Initialize the DataRecoveryAgent.

        Sets up the service clients used during recovery, reads the agent's
        CS options, and builds ``self.todo``: an ordered rule table, split by
        whether the transformation has input files, where each rule carries a
        ``Check`` predicate and the ``Actions`` to run on a matching job.
        """
        AgentModule.__init__(self, *args, **kwargs)
        self.name = 'DataRecoveryAgent'
        self.enabled = False
        self.getJobInfoFromJDLOnly = False

        self.__getCSOptions()

        self.jobStatus = [
            'Failed', 'Done'
        ]  # This needs to be both otherwise we cannot account for all cases

        # Clients for querying/updating jobs, catalog files, transformations
        # and requests.
        self.jobMon = JobMonitoringClient()
        self.fcClient = FileCatalogClient()
        self.tClient = TransformationClient()
        self.reqClient = ReqClient()
        self.diracAPI = Dirac()
        # LFNs already accounted as processed by an earlier task in this cycle.
        self.inputFilesProcessed = set()
        # Rule table: each entry has Message/ShortMessage (for reporting),
        # a Counter, a Check predicate and the Actions to apply on a match.
        # Order matters within each list (see the inline notes below).
        self.todo = {'NoInputFiles':
                     [dict(Message="NoInputFiles: OutputExists: Job 'Done'",
                           ShortMessage="NoInputFiles: job 'Done' ",
                           Counter=0,
                           Check=lambda job: job.allFilesExist() and job.status == 'Failed',
                           Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                           ),
                      dict(Message="NoInputFiles: OutputMissing: Job 'Failed'",
                           ShortMessage="NoInputFiles: job 'Failed' ",
                           Counter=0,
                           Check=lambda job: job.allFilesMissing() and job.status == 'Done',
                           Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                           ),
                      ],
                     'InputFiles':
                     [ \
                     # must always be first!

                         dict(Message="One of many Successful: clean others",
                              ShortMessage="Other Tasks --> Keep",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and job.otherTasks and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed),
                              Actions=lambda job, tInfo: [self.inputFilesProcessed.update(job.inputFiles),
                                                          job.setJobDone(tInfo),
                                                          job.setInputProcessed(tInfo)]
                              ),
                         dict(Message="Other Task processed Input, no Output: Fail",
                              ShortMessage="Other Tasks --> Fail",
                              Counter=0,
                              Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allFilesMissing() and job.status != 'Failed',
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Other Task processed Input: Fail and clean",
                              ShortMessage="Other Tasks --> Cleanup",
                              Counter=0,
                              Check=lambda job: set(job.inputFiles).issubset(
                                  self.inputFilesProcessed) and not job.allFilesMissing(),
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)]
                              ),
                         dict(Message="InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean",
                              ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
                              Counter=0,
                              Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \
                              not job.allTransFilesDeleted(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo),
                                                          job.setInputDeleted(tInfo)],
                              ),
                         dict(Message="InputFile(s) Deleted, output Exists: mark job 'Failed', clean",
                              ShortMessage="Input Deleted --> Job 'Failed, Cleanup",
                              Counter=0,
                              Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \
                              job.allTransFilesDeleted() and not job.allFilesMissing(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)],
                              ),
                         # All Output Exists
                         dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
                              ShortMessage="Output Exists --> Job Done, Input Processed",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              not job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)]
                              ),
                         dict(Message="Output Exists, job Failed, input Processed --> Job Done",
                              ShortMessage="Output Exists --> Job Done",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setJobDone(tInfo)]
                              ),
                         dict(Message="Output Exists, job Done, input not Processed --> Input Processed",
                              ShortMessage="Output Exists --> Input Processed",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              not job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)]
                              ),
                         # outputmissing
                         dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
                              ShortMessage="Max ErrorCount --> Input MaxReset",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist() and \
                              job.checkErrorCount(),
                              Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)]
                              ),
                         dict(Message="Output Missing, job Failed, input Assigned --> Input Unused",
                              ShortMessage="Output Missing --> Input Unused",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputUnused(tInfo)]
                              ),
                         dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
                              ShortMessage="Output Missing --> Job Failed, Input Unused",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)]
                              ),
                         # some files missing, needing cleanup. Only checking for
                         # assigned, because processed could mean an earlier job was
                         # succesful and this one is just the duplicate that needed
                         # to be removed! But we check for other tasks earlier, so
                         # this should not happen
                         dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
                              ShortMessage="Output Missing --> Cleanup, Input Unused",
                              Counter=0,
                              Check=lambda job: job.someFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)]
                              ),
                         dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
                              ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused",
                              Counter=0,
                              Check=lambda job: job.someFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              job.allFilesAssigned() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [
                                  job.cleanOutputs(tInfo), job.setInputUnused(tInfo), job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Some missing, job Done --> job Failed",
                              ShortMessage="Output Missing, Done --> Job Failed",
                              Counter=0,
                              Check=lambda job: not job.allFilesExist() and job.status == 'Done',
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Something Strange",
                              ShortMessage="Strange",
                              Counter=0,
                              Check=lambda job: job.status not in ("Failed", "Done"),
                              Actions=lambda job, tInfo: []
                              ),
                         # should always be the last one!
                         dict(Message="Failed Hard",
                              ShortMessage="Failed Hard",
                              Counter=0,
                              Check=lambda job: False,  # never
                              Actions=lambda job, tInfo: []
                              ),
                     ]
                     }
        # Cache defaulting to (0, 0) per key; presumably keyed by
        # transformation ID -- TODO confirm semantics against the run loop.
        self.jobCache = defaultdict(lambda: (0, 0))
        # Notification options
        self.notesToSend = ""
        self.subject = "DataRecoveryAgent"
        self.startTime = time.time()
Пример #23
0
    def __sendAccounting(self, jobID):
        """Send a WMS accounting record for the given (stalled) job.

        Builds a Job accounting report from the job's DB attributes, logging
        info and last heartbeat, commits it, and on success flags the job as
        accounted.

        :param jobID: ID of the job to account
        :return: S_OK/S_ERROR structure from the commit (or the failed lookup)
        """
        try:
            accountingReport = Job()
            # Defaults so the exception log below can always print them,
            # even if the lookups that set them fail.
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # Extend the end time to the last heartbeat if it is more recent.
            # NOTE(review): if endTime were still the string 'Unknown' this
            # comparison would be type-mixed; any resulting error is caught by
            # the except below -- confirm __checkLoggingInfo always returns a
            # datetime here.
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, 'CPUNormalizationFactor')
            if not result['OK'] or not result['Value']:
                self.log.error(
                    'Error getting Job Parameter CPUNormalizationFactor, setting 0',
                    result.get('Message', 'No such value'))
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(
                    result['Value'].get('CPUNormalizationFactor'))

        except Exception as e:
            self.log.exception(
                "Exception in __sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e)
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData['ExecTime']:
            acData['ExecTime'] = acData['CPUTime']
        elif acData['ExecTime'] < acData['CPUTime']:
            acData['ExecTime'] = acData['CPUTime']

        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            # Mark the job accounted only once the report was committed.
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result
import time

__RCSID__ = "$Id$"

from DIRAC.Core.Base import Script
# Command line is parsed before the WMS client imports below are pulled in.
Script.parseCommandLine()

from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient

# sut
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient

from DIRAC.tests.Integration.WorkloadManagementSystem.Test_Client_WMS import helloWorldJob, createFile

# Module-level client instances shared by the tests in this script.
jobMonitoringClient = JobMonitoringClient()
jobStateUpdateClient = JobStateUpdateClient()


def createJob():
    """Submit a fresh hello-world job to the WMS and return its job ID.

    Raises AssertionError (with the service message) if submission fails.
    """
    newJob = helloWorldJob()
    descriptionFile = createFile(newJob)

    client = WMSClient()
    result = client.submitJob(newJob._toJDL(xmlFile=descriptionFile))
    assert result['OK'], result['Message']
    return int(result['Value'])

Пример #25
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs

        Submits len(lfnss) * len(types) jobs and exercises the JobMonitoring
        summary/selection calls and a couple of JobStateUpdate calls on them.
        """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination('DIRAC.Jenkins.ch')
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
                # BUGFIX: append inside the inner loop so EVERY submitted job
                # is recorded (and later deleted), not just the last one of
                # each lfns batch.
                jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'},
                        msg="Got %s" % res['Value'])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(types),
                         msg="Got %s" % str(sorted(res['Value'])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'], res.get('Message'))
        # BUGFIX: stringify the sorted list for the failure message
        # (previously sorted(str(...)) sorted the characters of the string).
        self.assertEqual(sorted(res['Value']),
                         sorted(['Unknown']),
                         msg="Got %s" % str(sorted(res['Value'])))

        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_empty = res['Value']
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanNow = res['Value']
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanOneYear = res['Value']
        # Groups older than a year must be a subset of those older than "now".
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(
                                         ['Job accepted', 'Job Rescheduled'])])
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'], res.get('Message'))
        try:
            self.assertTrue(
                res['Value'].get('Received') +
                res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
        except TypeError:
            # One of the counters may be absent (None); tolerate that.
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'], res.get('Message'))

        # Bulk status update on the last submitted job.
        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'], res.get('Message'))

        # delete the jobs - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
Пример #26
0
    def _sendAccounting(self, jobID):
        """
        Send WMS accounting data for the given job.

        Run inside thread.

        Builds a Job accounting report from the job's DB attributes, logging
        info and last heartbeat, commits it, and on success flags the job as
        accounted.

        :param jobID: ID of the job to account
        :return: S_OK/S_ERROR structure from the commit (or the failed lookup)
        """
        try:
            accountingReport = Job()
            # Defaults so the exception log below can always print them,
            # even if the lookups that set them fail.
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"

            result = self.jobDB.getJobAttributes(jobID)
            if not result["OK"]:
                return result
            jobDict = result["Value"]

            startTime, endTime = self._checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self._checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # Extend the end time to the last heartbeat if it is more recent.
            # NOTE(review): if endTime were still the string "Unknown" this
            # comparison would raise TypeError on Python 3; it is caught by
            # the except below -- confirm _checkLoggingInfo always returns a
            # datetime here.
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, "CPUNormalizationFactor")
            if not result["OK"] or not result["Value"]:
                self.log.error(
                    "Error getting Job Parameter CPUNormalizationFactor, setting 0",
                    result.get("Message", "No such value"),
                )
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(
                    result["Value"].get("CPUNormalizationFactor"))

        except Exception as e:
            self.log.exception(
                "Exception in _sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e,
            )
            return S_ERROR("Exception")
        processingType = self._getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            "Site": jobDict["Site"],
            "User": jobDict["Owner"],
            "UserGroup": jobDict["OwnerGroup"],
            "JobGroup": jobDict["JobGroup"],
            "JobType": jobDict["JobType"],
            "JobClass": jobDict["JobSplitType"],
            "ProcessingType": processingType,
            "FinalMajorStatus": JobStatus.FAILED,
            "FinalMinorStatus": JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
            "CPUTime": lastCPUTime,
            "NormCPUTime": lastCPUTime * cpuNormalization,
            "ExecTime": lastWallTime,
            "InputDataSize": 0.0,
            "OutputDataSize": 0.0,
            "InputDataFiles": 0,
            "OutputDataFiles": 0,
            "DiskSpace": 0.0,
            "InputSandBoxSize": 0.0,
            "OutputSandBoxSize": 0.0,
            "ProcessedEvents": 0,
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData["ExecTime"]:
            acData["ExecTime"] = acData["CPUTime"]
        elif acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result["OK"]:
            # Mark the job accounted only once the report was committed.
            self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        else:
            self.log.error(
                "Failed to send accounting report",
                "Job: %d, Error: %s" % (int(jobID), result["Message"]))
        return result
Пример #27
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """ Verifying all JobStateUpdate and JobMonitoring functions

        Submits one job, drives it through several state updates and checks
        each JobMonitoring accessor on it.
        """
        # FIX: replaced the deprecated unittest alias self.assert_ (removed
        # in Python 3.12) with self.assertTrue, and attach the service
        # message so failures are diagnosable -- consistent with the other
        # tests in this file.
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        jobID = int(res['Value'])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobJDL(jobID, False)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], {})
        res = jobMonitor.getJobsParameters([jobID], ['Owner'])
        self.assertTrue(res['OK'], res.get('Message'))

        # Adding stuff
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'),
                                                      ('par2', 'par2Value')])
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status',
                                                     'source')
        self.assertTrue(res['OK'], res.get('Message'))
        #     res = jobStateUpdate.setJobFlag()
        #     self.assertTrue( res['OK'], res.get('Message') )
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assertTrue( res['OK'], res.get('Message') )
        res = jobStateUpdate.setJobSite(jobID, 'Site')
        self.assertTrue(res['OK'], res.get('Message'))
        #     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
        #     self.assertTrue( res['OK'], res.get('Message') )

        # now checking few things
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], 'Running')
        res = jobMonitor.getJobParameter(jobID, 'par1')
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], {'par1': 'par1Value'})
        res = jobMonitor.getJobParameters(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], {
            'par1': 'par1Value',
            'par2': 'par2Value'
        })
        res = jobMonitor.getJobAttribute(jobID, 'Site')
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], 'Site')
        res = jobMonitor.getJobAttributes(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['JobName'], 'helloWorld')
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['Status'], 'Running')
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getInputData(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getJobPrimarySummary(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus',
                                           'Unknown')
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value']['Status'], 'Done')
        self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'},
                                           {'boh': 'boh'})
        self.assertTrue(res['OK'], res.get('Message'))

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)
Пример #28
0
 def __init__(self):
     """Initialize the task handler with its WMS submission and
     job-monitoring clients, on top of the TaskBase setup."""
     TaskBase.__init__(self)
     self.submissionClient = WMSClient()
     self.jobMonitoringClient = JobMonitoringClient()
Пример #29
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
        # FIX: replaced the deprecated unittest alias self.assert_ (removed
        # in Python 3.12) with self.assertTrue, and attach the service
        # message so failures are diagnosable -- consistent with the other
        # tests in this file.
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        # self.assertEqual( type( res['Value'] ), int )
        # self.assertEqual( res['Value'], res['JobID'] )
        # jobID = res['JobID']
        jobID = res['Value']

        # updating the status
        jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp',
                                    'source')

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], 'Received')

        # updating the status again
        jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], 'Killed')

        # updating the status aaaagain
        jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Done')  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'], 'Deleted')
Пример #30
0
    def getProductionApplicationSummary(self,
                                        productionID,
                                        status=None,
                                        minorStatus=None,
                                        printOutput=False):
        """Returns an application status summary for the productions in the system. If printOutput is
       specified, the result is printed to the screen.  This queries the WMS
       for the given productionID and provides an up-to-date snapshot of the application status
       combinations and associated WMS JobIDs.
    """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        jobIDs = list(statusDict['Value'])
        if not jobIDs:
            return S_ERROR('No JobIDs with matching conditions found')

        self.log.verbose('Considering %s jobs with selected conditions' %
                         (len(jobIDs)))
        # now need to get the application status information
        result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
        if not result['OK']:
            self.log.warn('Could not get application status for jobs list')
            return result

        appStatus = result['Value']
        #    self._prettyPrint(appStatus)
        #    self._prettyPrint(statusDict['Value'])
        # Now format the result.
        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            for key, val in atts.iteritems():
                if key == 'Status':
                    uniqueStatus = val.capitalize()
                    if uniqueStatus not in summary:
                        summary[uniqueStatus] = {}
                    if atts['MinorStatus'] not in summary[uniqueStatus]:
                        summary[uniqueStatus][atts['MinorStatus']] = {}
                    if appStatus[job]['ApplicationStatus'] not in summary[
                            uniqueStatus][atts['MinorStatus']]:
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']] = {}
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']]['Total'] = 1
                        submittedJobs += 1
                        if uniqueStatus == 'Done':
                            doneJobs += 1
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']]['JobList'] = [
                                job
                            ]
                    else:
                        if appStatus[job]['ApplicationStatus'] not in summary[
                                uniqueStatus][atts['MinorStatus']]:
                            summary[uniqueStatus][atts['MinorStatus']] = {}
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]['ApplicationStatus']] = {}
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['Total'] = 1
                            submittedJobs += 1
                            if uniqueStatus == 'Done':
                                doneJobs += 1
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['JobList'] = [job]
                        else:
                            current = summary[uniqueStatus][
                                atts['MinorStatus']][appStatus[job][
                                    'ApplicationStatus']]['Total']
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['Total'] = current + 1
                            submittedJobs += 1
                            if uniqueStatus == 'Done':
                                doneJobs += 1
                            jobList = summary[uniqueStatus][
                                atts['MinorStatus']][appStatus[job][
                                    'ApplicationStatus']]['JobList']
                            jobList.append(job)
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['JobList'] = jobList

        if not printOutput:
            result = S_OK()
            if not status and not minorStatus:
                result['Totals'] = {
                    'Submitted': int(submittedJobs),
                    'Done': int(doneJobs)
                }
            result['Value'] = summary
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering status %s' % (
            productionID, status)
        if minorStatus:
            message += 'and MinorStatus = %s' % (minorStatus)

        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + 'ApplicationStatus'.ljust(mStatAdj) + \
            'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.iteritems():
            message += '\n'
            for minor, appInfo in metadata.iteritems():
                message += '\n'
                for appStat, jobInfo in appInfo.iteritems():
                    message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + appStat.ljust(mStatAdj) + \
                        str(jobInfo['Total']).ljust(totalAdj) + str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'

        # self._prettyPrint(summary)
        if status or minorStatus:
            return S_OK(summary)

        result = self.getProductionProgress(productionID)
        if not result['OK']:
            self.log.warn('Could not get production progress information')
            return result

        if 'Created' in result['Value']:
            createdJobs = int(result['Value']['Created']) + submittedJobs
        else:
            createdJobs = submittedJobs

        percSub = int(100 * submittedJobs / createdJobs)
        percDone = int(100 * doneJobs / createdJobs)
        print '\nCurrent status of production %s:\n' % productionID
        print 'Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' + str(submittedJobs).ljust(7) + \
            'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )'
        print 'Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' + str(doneJobs).ljust(7) + \
            'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )'
        result = S_OK()
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        result['Value'] = summary
        # self.pPrint(result)
        return result