Exemplo n.º 1
0
    def _updateConfiguration(self, key, value, path="/LocalSite"):
        """Update local configuration to be used by submitted job wrappers"""
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join(".", self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)

        section = "/"
        for p in path.split("/")[1:]:
            section = os.path.join(section, p)
            if not localCfg.isSection(section):
                localCfg.createNewSection(section)

        localCfg.setOption("%s/%s" % (section, key), value)
        localCfg.writeToFile(localConfigFile)
Exemplo n.º 2
0
class JobRepository(object):
    def __init__(self, repository=None):
        self.location = repository
        if not self.location:
            if "HOME" in os.environ:
                self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
            else:
                self.location = '%s/.dirac.repo.rep' % os.getcwd()
        self.repo = CFG()
        if os.path.exists(self.location):
            self.repo.loadFromFile(self.location)
            if not self.repo.existsKey('Jobs'):
                self.repo.createNewSection('Jobs')
        else:
            self.repo.createNewSection('Jobs')
        self.OK = True
        written = self._writeRepository(self.location)
        if not written:
            self.OK = False

    def isOK(self):
        return self.OK

    def readRepository(self):
        return S_OK(self.repo.getAsDict('Jobs'))

    def writeRepository(self, alternativePath=None):
        destination = self.location
        if alternativePath:
            destination = alternativePath
        written = self._writeRepository(destination)
        if not written:
            return S_ERROR("Failed to write repository")
        return S_OK(destination)

    def resetRepository(self, jobIDs=[]):
        if not jobIDs:
            jobs = self.readRepository()['Value']
            jobIDs = list(jobs)
        paramDict = {'State': 'Submitted', 'Retrieved': 0, 'OutputData': 0}
        for jobID in jobIDs:
            self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _writeRepository(self, path):
        handle, tmpName = tempfile.mkstemp()
        written = self.repo.writeToFile(tmpName)
        os.close(handle)
        if not written:
            if os.path.exists(tmpName):
                os.remove(tmpName)
            return written
        if os.path.exists(path):
            gLogger.debug("Replacing %s" % path)
        try:
            shutil.move(tmpName, path)
            return True
        except Exception as x:
            gLogger.error("Failed to overwrite repository.", x)
            gLogger.info(
                "If your repository is corrupted a backup can be found %s" %
                tmpName)
            return False

    def appendToRepository(self, repoLocation):
        if not os.path.exists(repoLocation):
            gLogger.error("Secondary repository does not exist", repoLocation)
            return S_ERROR("Secondary repository does not exist")
        self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
        self._writeRepository(self.location)
        return S_OK()

    def addJob(self,
               jobID,
               state='Submitted',
               retrieved=0,
               outputData=0,
               update=False):
        paramDict = {
            'State': state,
            'Time': self._getTime(),
            'Retrieved': int(retrieved),
            'OutputData': outputData
        }
        self._writeJob(jobID, paramDict, update)
        self._writeRepository(self.location)
        return S_OK(jobID)

    def updateJob(self, jobID, paramDict):
        if self._existsJob(jobID):
            paramDict['Time'] = self._getTime()
            self._writeJob(jobID, paramDict, True)
            self._writeRepository(self.location)
        return S_OK()

    def updateJobs(self, jobDict):
        for jobID, paramDict in jobDict.items():
            if self._existsJob(jobID):
                paramDict['Time'] = self._getTime()
                self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _getTime(self):
        runtime = time.ctime()
        return runtime.replace(" ", "_")

    def _writeJob(self, jobID, paramDict, update):
        jobID = str(jobID)
        jobExists = self._existsJob(jobID)
        if jobExists and (not update):
            gLogger.warn("Job exists and not overwriting")
            return S_ERROR("Job exists and not overwriting")
        if not jobExists:
            self.repo.createNewSection('Jobs/%s' % jobID)
        for key, value in paramDict.items():
            self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
        return S_OK()

    def removeJob(self, jobID):
        res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
        if res:
            self._writeRepository(self.location)
        return S_OK()

    def existsJob(self, jobID):
        return S_OK(self._existsJob(jobID))

    def _existsJob(self, jobID):
        return self.repo.isSection('Jobs/%s' % jobID)

    def getLocation(self):
        return S_OK(self.location)

    def getSize(self):
        return S_OK(len(self.repo.getAsDict('Jobs')))
Exemplo n.º 3
0
    elif os.path.isfile(
            os.path.expandvars("$WORKSPACE") +
            "/ServerInstallDIR/etc/dirac.cfg"):
        localConfigFile = os.path.expandvars(
            "$WORKSPACE") + "/ServerInstallDIR/etc/dirac.cfg"
    elif os.path.isfile("./etc/dirac.cfg"):
        localConfigFile = "./etc/dirac.cfg"
    else:
        print("Local CFG file not found")
        exit(2)

localCfg.loadFromFile(localConfigFile)
if not localCfg.isSection("/LocalSite"):
    localCfg.createNewSection("/LocalSite")
localCfg.setOption("/LocalSite/CPUTimeLeft", 5000)
localCfg.setOption("/DIRAC/Security/UseServerCertificate", False)

if not sMod:
    if not setup:
        setup = gConfig.getValue("/DIRAC/Setup")
        if not setup:
            setup = "dirac-JenkinsSetup"

    if not localCfg.isSection("/Operations"):
        localCfg.createNewSection("/Operations")
    if not localCfg.isSection("/Operations/%s" % setup):
        localCfg.createNewSection("/Operations/%s" % setup)
    localCfg.setOption("/Operations/%s/SoftwareDistModule" % setup, "")

localCfg.writeToFile(localConfigFile)
Exemplo n.º 4
0
    def execute(self):
        """The JobAgent execution method.
    """

        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')

        # Check if we can match jobs at all
        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              ': %d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            self.log.info(
                'CE is not available (and there are no running jobs)')
            return self.__finish('CE Not Available')

        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                self.timeLeft = self.computeCPUWorkLeft()
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        # if we are here we assume that a job can be matched
        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:

            # if we don't match a job, independently from the reason,
            # we wait a bit longer before trying again
            self.am_setOption("PollingTime",
                              int(self.am_getOption("PollingTime") * 1.5))

            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        # If we are here it is because we matched a job
        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                jobReport.setJobStatus(status='Failed',
                                       minor='Matcher did not return %s' %
                                       (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                jobReport.setJobStatus(status='Failed',
                                       minor='Matcher returned null %s' %
                                       (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            jobReport.setJobStatus(status='Failed',
                                   minor='Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            jobReport.setJobStatus(status='Failed', minor=msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions and 'dirac-jobexec' in params.get(
                'Executable', '').strip():
            params['Arguments'] = (params.get('Arguments', '') + ' ' +
                                   self.extraOptions).strip()
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport.setJobParameter(par_name='MatcherServiceTime',
                                      par_value=str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(par_name=thisp,
                                              par_value=gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus(status='Matched',
                                   minor='Job Received by Agent',
                                   sendFlag=False)
            result_setupProxy = self._setupProxy(ownerDN, jobGroup)
            if not result_setupProxy['OK']:
                return self._rescheduleFailedJob(jobID,
                                                 result_setupProxy['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result_setupProxy.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict,
                                                  jobReport)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result_submitJob = self._submitJob(
                jobID=jobID,
                jobParams=params,
                resourceParams=ceDict,
                optimizerParams=optimizerParams,
                proxyChain=proxyChain,
                jobReport=jobReport,
                processors=processors,
                wholeNode=wholeNode,
                maxNumberOfProcessors=maxNumberOfProcessors,
                mpTag=mpTag)

            # Committing the JobReport before evaluating the result of job submission
            res = jobReport.commit()
            if not res['OK']:
                resFD = jobReport.generateForwardDISET()
                if not resFD['OK']:
                    self.log.error("Error generating ForwardDISET operation",
                                   resFD['Message'])
                else:
                    # Here we create the Request.
                    op = resFD['Value']
                    request = Request()
                    requestName = 'jobAgent_%s' % jobID
                    request.RequestName = requestName.replace('"', '')
                    request.JobID = jobID
                    request.SourceComponent = "JobAgent_%s" % jobID
                    request.addOperation(op)
                    # This might fail, but only a message would be printed.
                    self._sendFailoverRequest(request)

            if not result_submitJob['OK']:
                return self.__finish(result_submitJob['Message'])
            elif 'PayloadFailed' in result_submitJob:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result_submitJob[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        return S_OK('Job Agent cycle complete')