Пример #1
0
 def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                  requiredTimeLeft):
     """
  To be overwritten if a given Pilot does not require a full proxy
 """
     return gProxyManager.getPilotProxyFromDIRACGroup(
         ownerDN, ownerGroup, requiredTimeLeft)
Пример #2
0
def getProxyFileForCloud(ce):
    """Get a file with the proxy to be used to connect to the
        given cloud endpoint

    :param ce: cloud endpoint object
    :return: S_OK/S_ERROR, value is the path to the proxy file
    """

    vo = ce.parameters.get("VO")
    cloudDN = None
    cloudGroup = None
    if vo:
        result = findGenericCloudCredentials(vo=vo)
        if not result["OK"]:
            return result
        cloudDN, cloudGroup = result["Value"]

    cloudUser = ce.parameters.get("GenericCloudUser")
    if cloudUser:
        result = Registry.getDNForUsername(cloudUser)
        if not result["OK"]:
            return result
        cloudDN = result["Value"][0]
    cloudGroup = ce.parameters.get("GenericCloudGroup", cloudGroup)

    if cloudDN and cloudGroup:
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            cloudDN, cloudGroup, 3600)
        if not result["OK"]:
            return result
        proxy = result["Value"]
        result = gProxyManager.dumpProxyToFile(proxy)
        return result
    else:
        return S_ERROR("Could not find generic cloud credentials")
Пример #3
0
 def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                  requiredTimeLeft):
     """
  To be overwritten if a given Pilot does not require a full proxy
 """
     self.log.info("Downloading %s@%s proxy" % (ownerDN, ownerGroup))
     return gProxyManager.getPilotProxyFromDIRACGroup(
         ownerDN, ownerGroup, requiredTimeLeft)
Пример #4
0
  def getPilotProxy( self, userDN, userGroup, validity = 43200 ):
    """Retrieves a pilot proxy with default 12hr validity and stores
       this in a file in the local directory by default.

       Example usage:

         >>> print diracAdmin.getVOMSProxy()
         {'OK': True, 'Value': }

       @return: S_OK,S_ERROR

    """

    return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft = validity )
Пример #5
0
  def getPilotProxy( self, userDN, userGroup, validity = 43200 ):
    """Retrieves a pilot proxy with default 12hr validity and stores
       this in a file in the local directory by default.

       Example usage:

       >>> print diracAdmin.getVOMSProxy()
       {'OK': True, 'Value': }

       @return: S_OK,S_ERROR

    """

    return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft = validity )
Пример #6
0
  def __getExecutable( self, queue, pilotsToSubmit, bundleProxy = True ):
    """ Prepare the full executable for queue
    """

    result = gProxyManager.getPilotProxyFromDIRACGroup( self.genericPilotDN, self.genericPilotGroup, 1000 )
    if not result['OK']:
      return result

    proxyString = result['Value']
    proxy = ''
    if bundleProxy:
      proxy = proxyString
    pilotOptions = self.__getPilotOptions( queue, pilotsToSubmit )
    if pilotOptions is None:
      return S_ERROR( 'Errors in compiling pilot options' )
    executable = self.__writePilotScript( self.workingDirectory, pilotOptions, proxy )
    result = S_OK()
    result['Executable'] = executable
    result['Proxy'] = proxyString
    return result
Пример #7
0
    def __getExecutable(self, queue, pilotsToSubmit, bundleProxy=True):
        """ Prepare the full executable for queue
    """

        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.genericPilotDN, self.genericPilotGroup, 1000)
        if not result['OK']:
            return result

        proxyString = result['Value']
        proxy = ''
        if bundleProxy:
            proxy = proxyString
        pilotOptions = self.__getPilotOptions(queue, pilotsToSubmit)
        if pilotOptions is None:
            return S_ERROR('Errors in compiling pilot options')
        executable = self.__writePilotScript(self.workingDirectory,
                                             pilotOptions, proxy)
        result = S_OK()
        result['Executable'] = executable
        result['Proxy'] = proxyString
        return result
Пример #8
0
    def execute(self):
        """The JobAgent execution method."""
        self.log.verbose("Job Agent execution loop")

        queueDictItems = list(self.queueDict.items())
        random.shuffle(queueDictItems)

        # Check that there is enough slots locally
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"] or result["Value"]:
            return result

        for queueName, queueDictionary in queueDictItems:

            # Make sure there is no problem with the queue before trying to submit
            if not self._allowedToSubmit(queueName):
                continue

            # Get a working proxy
            ce = queueDictionary["CE"]
            cpuTime = 86400 * 3
            self.log.verbose(
                "Getting pilot proxy",
                "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            proxy = result["Value"]
            result = proxy.getRemainingSecs()  # pylint: disable=no-member
            if not result["OK"]:
                return result
            lifetime_secs = result["Value"]
            ce.setProxy(proxy, lifetime_secs)

            # Check that there is enough slots in the remote CE to match a job
            result = self._checkCEAvailability(ce)
            if not result["OK"] or result["Value"]:
                self.failedQueues[queueName] += 1
                continue

            # Get environment details and enhance them
            result = self._getCEDict(ce)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                continue
            ceDictList = result["Value"]

            for ceDict in ceDictList:
                # Information about number of processors might not be returned in CE.getCEStatus()
                ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                    "NumberOfProcessors")
                self._setCEDict(ceDict)

            # Update the configuration with the names of the Site, CE and queue to target
            # This is used in the next stages
            self._updateConfiguration("Site", queueDictionary["Site"])
            self._updateConfiguration("GridCE", queueDictionary["CEName"])
            self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
            self._updateConfiguration("RemoteExecution", True)

            # Try to match a job
            jobRequest = self._matchAJob(ceDictList)
            while jobRequest["OK"]:

                # Check matcher information returned
                matcherParams = ["JDL", "DN", "Group"]
                matcherInfo = jobRequest["Value"]
                jobID = matcherInfo["JobID"]
                jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
                result = self._checkMatcherInfo(matcherInfo, matcherParams,
                                                jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break

                jobJDL = matcherInfo["JDL"]
                jobGroup = matcherInfo["Group"]
                ownerDN = matcherInfo["DN"]
                ceDict = matcherInfo["CEDict"]
                matchTime = matcherInfo["matchTime"]

                optimizerParams = {}
                for key in matcherInfo:
                    if key not in matcherParams:
                        optimizerParams[key] = matcherInfo[key]

                # Get JDL paramters
                parameters = self._getJDLParameters(jobJDL)
                if not parameters["OK"]:
                    jobReport.setJobStatus(
                        status=JobStatus.FAILED,
                        minorStatus="Could Not Extract JDL Parameters")
                    self.log.warn("Could Not Extract JDL Parameters",
                                  parameters["Message"])
                    self.failedQueues[queueName] += 1
                    break

                params = parameters["Value"]
                result = self._extractValuesFromJobParams(params, jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break
                submissionParams = result["Value"]
                jobID = submissionParams["jobID"]
                jobType = submissionParams["jobType"]

                self.log.verbose("Job request successful: \n",
                                 jobRequest["Value"])
                self.log.info(
                    "Received",
                    "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
                    (jobID, jobType, ownerDN, jobGroup))
                try:
                    jobReport.setJobParameter(par_name="MatcherServiceTime",
                                              par_value=str(matchTime),
                                              sendFlag=False)
                    jobReport.setJobStatus(status=JobStatus.MATCHED,
                                           minorStatus="Job Received by Agent",
                                           sendFlag=False)

                    # Setup proxy
                    result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                    if not result_setupProxy["OK"]:
                        result = self._rescheduleFailedJob(
                            jobID, result_setupProxy["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    proxyChain = result_setupProxy.get("Value")

                    # Check software and install them if required
                    software = self._checkInstallSoftware(
                        jobID, params, ceDict, jobReport)
                    if not software["OK"]:
                        self.log.error("Failed to install software for job",
                                       "%s" % (jobID))
                        errorMsg = software["Message"]
                        if not errorMsg:
                            errorMsg = "Failed software installation"
                        result = self._rescheduleFailedJob(jobID, errorMsg)
                        self.failedQueues[queueName] += 1
                        break

                    # Submit the job to the CE
                    self.log.debug("Before self._submitJob() (%sCE)" %
                                   (self.ceName))
                    result_submitJob = self._submitJob(
                        jobID=jobID,
                        jobParams=params,
                        resourceParams=ceDict,
                        optimizerParams=optimizerParams,
                        proxyChain=proxyChain,
                        jobReport=jobReport,
                        processors=submissionParams["processors"],
                        wholeNode=submissionParams["wholeNode"],
                        maxNumberOfProcessors=submissionParams[
                            "maxNumberOfProcessors"],
                        mpTag=submissionParams["mpTag"],
                    )

                    # Committing the JobReport before evaluating the result of job submission
                    res = jobReport.commit()
                    if not res["OK"]:
                        resFD = jobReport.generateForwardDISET()
                        if not resFD["OK"]:
                            self.log.error(
                                "Error generating ForwardDISET operation",
                                resFD["Message"])
                        elif resFD["Value"]:
                            # Here we create the Request.
                            op = resFD["Value"]
                            request = Request()
                            requestName = "jobAgent_%s" % jobID
                            request.RequestName = requestName.replace('"', "")
                            request.JobID = jobID
                            request.SourceComponent = "JobAgent_%s" % jobID
                            request.addOperation(op)
                            # This might fail, but only a message would be printed.
                            self._sendFailoverRequest(request)

                    if not result_submitJob["OK"]:
                        self.log.error("Error during submission",
                                       result_submitJob["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    elif "PayloadFailed" in result_submitJob:
                        # Do not keep running and do not overwrite the Payload error
                        message = "Payload execution failed with error code %s" % result_submitJob[
                            "PayloadFailed"]
                        self.log.info(message)

                    self.log.debug("After %sCE submitJob()" % (self.ceName))

                    # Check that there is enough slots locally
                    result = self._checkCEAvailability(self.computingElement)
                    if not result["OK"] or result["Value"]:
                        return result

                    # Check that there is enough slots in the remote CE to match a new job
                    result = self._checkCEAvailability(ce)
                    if not result["OK"] or result["Value"]:
                        self.failedQueues[queueName] += 1
                        break

                    # Try to match a new job
                    jobRequest = self._matchAJob(ceDictList)
                except Exception as subExcept:  # pylint: disable=broad-except
                    self.log.exception("Exception in submission",
                                       "",
                                       lException=subExcept,
                                       lExcInfo=True)
                    result = self._rescheduleFailedJob(
                        jobID, "Job processing failed with exception")
                    self.failedQueues[queueName] += 1
                    break

            if not jobRequest["OK"]:
                self._checkMatchingIssues(jobRequest)
                self.failedQueues[queueName] += 1
                continue

        return S_OK("Push Job Agent cycle complete")
Пример #9
0
    def updatePilotStatus(self):
        """ Update status of pilots in transient states
    """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'Status': TRANSIENT_PILOT_STATUS,
                'OwnerDN': self.pilotDN,
                'OwnerGroup': self.pilotGroup
            })
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' %
                               result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            #print "AT >>> pilotRefs", pilotRefs

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']

            #print "AT >>> pilotDict", pilotDict

            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" +
                                            pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE',
                               '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            #print "AT >>> pilotCEDict", pilotCEDict

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info('Updating status to %s for pilot %s' %
                                  (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(
                        pRef, newStatus, '', 'Updated by SiteDirector')
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower(
                    ) == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error(
                                'Failed to get pilot output',
                                '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(
                                    pRef, output, error)
                                if not result['OK']:
                                    self.log.error(
                                        'Failed to store pilot output',
                                        result['Message'])
                            else:
                                self.log.warn(
                                    'Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            if not ce.isProxyValid(120):
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'OutputReady': 'False',
                'Status': FINAL_PILOT_STATUS
            })

            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(
                            pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output',
                                           result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({
                    'DestinationSite':
                    ceName,
                    'Queue':
                    queueName,
                    'GridType':
                    ceType,
                    'GridSite':
                    siteName,
                    'AccountingSent':
                    'False',
                    'Status':
                    FINAL_PILOT_STATUS
                })

                if not result['OK']:
                    self.log.error('Failed to select pilots',
                                   result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB',
                                   result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()
Пример #10
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        queues = self.queueDict.keys()
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: \n%s' %
                    (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                           ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                             ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                                    ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

        return S_OK()
Пример #11
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'Status':TRANSIENT_PILOT_STATUS} )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      #print "AT >>> pilotRefs", pilotRefs

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info: %s' % result['Message'] )
        continue
      pilotDict = result['Value']

      #print "AT >>> pilotDict", pilotDict

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE: %s' % result['Message'] )
        continue
      pilotCEDict = result['Value']

      #print "AT >>> pilotCEDict", pilotCEDict

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        if oldStatus == ceStatus:
          # Status did not change, continue
          continue
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Pilot finished without reporting, consider it Aborted
          newStatus = 'Aborted'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output: %s' % result['Message'] )
            else:
              output, error = result['Value']
              result = pilotAgentsDB.storePilotOutput( pRef, output, error )
              if not result['OK']:
                self.log.error( 'Failed to store pilot output: %s' % result['Message'] )

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.genericPilotDN, self.genericPilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info: %s' % result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output: %s' % result['Message'] )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output: %s' % result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots: %s' % result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info: %s' % result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Пример #12
0
    def updatePilotStatus(self):
        """ Update status of pilots in transient states
    """
        for queue in self.queueDict:
            ce = self.queueDict[queue]["CE"]
            ceName = self.queueDict[queue]["CEName"]
            queueName = self.queueDict[queue]["QueueName"]
            ceType = self.queueDict[queue]["CEType"]
            siteName = self.queueDict[queue]["Site"]

            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "Status": TRANSIENT_PILOT_STATUS,
                    "OwnerDN": self.pilotDN,
                    "OwnerGroup": self.pilotGroup,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots: %s" % result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue

            # print "AT >>> pilotRefs", pilotRefs

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]

            # print "AT >>> pilotDict", pilotDict

            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]["PilotStamp"]:
                    stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result["OK"]:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
                if not result["OK"]:
                    return result
                self.proxy = result["Value"]
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
                continue
            pilotCEDict = result["Value"]

            # print "AT >>> pilotCEDict", pilotCEDict

            for pRef in pilotRefs:
                newStatus = ""
                oldStatus = pilotDict[pRef]["Status"]
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = "Aborted"
                elif ceStatus != "Unknown":
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                        self.log.info("Retrieving output for pilot %s" % pRef)
                        pilotStamp = pilotDict[pRef]["PilotStamp"]
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ":::" + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result["OK"]:
                            self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                        else:
                            output, error = result["Value"]
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result["OK"]:
                                    self.log.error("Failed to store pilot output", result["Message"])
                            else:
                                self.log.warn("Empty pilot output not stored to PilotDB")

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]["CE"]

            if not ce.isProxyValid(120):
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
                if not result["OK"]:
                    return result
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]["CEName"]
            queueName = self.queueDict[queue]["QueueName"]
            ceType = self.queueDict[queue]["CEType"]
            siteName = self.queueDict[queue]["Site"]
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "OutputReady": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )

            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info("Retrieving output for pilot %s" % pRef)
                    pilotStamp = pilotDict[pRef]["PilotStamp"]
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ":::" + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result["OK"]:
                        self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                    else:
                        output, error = result["Value"]
                        result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                        if not result["OK"]:
                            self.log.error("Failed to store pilot output", result["Message"])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots(
                    {
                        "DestinationSite": ceName,
                        "Queue": queueName,
                        "GridType": ceType,
                        "GridSite": siteName,
                        "AccountingSent": "False",
                        "Status": FINAL_PILOT_STATUS,
                    }
                )

                if not result["OK"]:
                    self.log.error("Failed to select pilots", result["Message"])
                    continue
                pilotRefs = result["Value"]
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result["OK"]:
                    self.log.error("Failed to get pilots info from DB", result["Message"])
                    continue
                pilotDict = result["Value"]
                result = self.sendPilotAccounting(pilotDict)
                if not result["OK"]:
                    self.log.error("Failed to send pilot agent accounting")

        return S_OK()
Пример #13
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                return S_ERROR('CPU time limit is not specified for queue %s' %
                               queue)
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.genericPilotDN, self.genericPilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: %s' %
                    (queue, result['Message']))
                continue

            totalSlots = result['Value']

            self.log.verbose(result['Message'])

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']

            result = taskQueueDB.getMatchingTaskQueues(ceDict)

            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found')
                continue

            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)
            self.log.verbose(
                'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' %
                (totalSlots, totalTQJobs, pilotsToSubmit))

            if pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy)
                if not result['OK']:
                    return result

                executable = result['Value']
                result = ce.submitJob(executable, '', pilotsToSubmit)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:' % queue,
                                   result['Message'])
                    continue
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.genericPilotDN,
                        self.genericPilotGroup, self.localhost, ceType, '',
                        stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: %s' %
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfuly submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: %s' %
                                           result['Message'])
                            continue

        return S_OK()
Пример #14
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    queues = self.queueDict.keys()
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      pilotsToSubmit = min( totalSlots, totalTQJobs )

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                              ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random()*sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                     tqID,
                                                     self.pilotDN,
                                                     self.pilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()
Пример #15
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
        if self.vo:
            tqDict["Community"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result["Value"]

        queues = self.queueDict.keys()
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]["CE"]
            ceName = self.queueDict[queue]["CEName"]
            ceType = self.queueDict[queue]["CEType"]
            queueName = self.queueDict[queue]["QueueName"]
            siteName = self.queueDict[queue]["Site"]
            siteMask = siteName in siteMaskList

            if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
                queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result["OK"]:
                self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
                continue
            ceInfoDict = result["CEInfoDict"]
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (
                    ceName,
                    queueName,
                    ceInfoDict["WaitingJobs"],
                    ceInfoDict["RunningJobs"],
                    ceInfoDict["SubmittedJobs"],
                    ceInfoDict["MaxTotalJobs"],
                )
            )

            totalSlots = result["Value"]

            ceDict = ce.getParameterDict()
            ceDict["GridCE"] = ceName
            if not siteMask and "Site" in ceDict:
                self.log.info("Site not in the mask %s" % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict["Site"]
            if self.vo:
                ceDict["Community"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            # This is a hack to get rid of !
            ceDict["SubmitPool"] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.info("No matching TQs found")
                continue

            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
                )
                if not result["OK"]:
                    self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result["Value"]
                    self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
                % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
            )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get("BundleProxy", False)
                jobExecDir = ""
                if ceType == "CREAM":
                    jobExecDir = "."
                jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
                httpProxy = self.queueDict[queue].get("HttpProxy", "")

                result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
                if not result["OK"]:
                    return result

                executable, pilotSubmissionChunk = result["Value"]
                result = ce.submitJob(executable, "", pilotSubmissionChunk)
                os.unlink(executable)
                if not result["OK"]:
                    self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result["Value"]
                self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key("PilotStampDict"):
                    stampDict = result["PilotStampDict"]
                tqPriorityList = []
                sumPriority = 0.0
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]["Priority"]
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                    )
                    if not result["OK"]:
                        self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot,
                            "Submitted",
                            ceName,
                            "Successfully submitted by the SiteDirector",
                            siteName,
                            queueName,
                        )
                        if not result["OK"]:
                            self.log.error("Failed to set pilot status: ", result["Message"])
                            continue

        return S_OK()
Пример #16
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add( site )
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add( site )
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                           None )
    totalWaitingPilots = 0
    if result['OK']:
      totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    #if totalWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10-failedCount ) )
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      siteMask = siteName in siteMaskList

      if not anySite and siteName not in jobSites:
        self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) )
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) )
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      #if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools
      
      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found for %s' % queue )
        continue

      matchedQueues += 1
      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) )

      # Get the number of already waiting pilots for these task queues
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                              None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
      if totalWaitingPilots >= totalTQJobs:
        self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
        continue

      self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      totalSlots = self.__getQueueSlots( queue )
      if totalSlots == 0:
        self.log.debug( '%s: No slots available' % queue )
        continue

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                              ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          self.failedQueues[queue] += 1
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
        totalSubmittedPilots += len( pilotList )
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
Пример #17
0
 def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
   """
    To be overwritten if a given Pilot does not require a full proxy
   """
   return gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )
Пример #18
0
 def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
   """
    To be overwritten if a given Pilot does not require a full proxy
   """
   self.log.info( "Downloading %s@%s proxy" % ( ownerDN, ownerGroup ) )
   return gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )
Пример #19
0
  def execute( self ):
    """The PilotAgent execution method.
    """

    self.pilotStalledDays = self.am_getOption( 'PilotStalledDays', 3 )
    self.gridEnv = self.am_getOption( 'GridEnv' )
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if setup:
        instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
        if instance:
          self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )
    result = self.pilotDB._getConnection()
    if result['OK']:
      connection = result['Value']
    else:
      return result

    result = self.pilotDB.getPilotGroups( self.identityFieldsList,
                                         {'Status': self.queryStateList } )
    if not result['OK']:
      self.log.error( 'Fail to get identities Groups', result['Message'] )
      return result
    if not result['Value']:
      return S_OK()

    pilotsToAccount = {}

    for ownerDN, ownerGroup, gridType, broker in result['Value']:

      if not gridType in self.eligibleGridTypes:
        continue

      self.log.verbose( 'Getting pilots for %s:%s @ %s %s' % ( ownerDN, ownerGroup, gridType, broker ) )

      condDict1 = {'Status':'Done',
                   'StatusReason':'Report from JobAgent',
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      condDict2 = {'Status':self.queryStateList,
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      for condDict in [ condDict1, condDict2]:
        result = self.clearWaitingPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to clear Waiting Pilot Jobs' )

        result = self.pilotDB.selectPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to get the Pilot Agents' )
          return result
        if not result['Value']:
          continue
        refList = result['Value']

        ret = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup )
        if not ret['OK']:
          self.log.error( ret['Message'] )
          self.log.error( 'Could not get proxy:', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
          continue
        proxy = ret['Value']

        self.log.verbose( "Getting status for %s pilots for owner %s and group %s" % ( len( refList ),
                                                                                      ownerDN, ownerGroup ) )

        for start_index in range( 0, len( refList ), MAX_JOBS_QUERY ):
          refsToQuery = refList[ start_index : start_index + MAX_JOBS_QUERY ]
          self.log.verbose( 'Querying %d pilots of %s starting at %d' %
                            ( len( refsToQuery ), len( refList ), start_index ) )
          result = self.getPilotStatus( proxy, gridType, refsToQuery )
          if not result['OK']:
            if result['Message'] == 'Broker not Available':
              self.log.error( 'Broker %s not Available' % broker )
              break
            self.log.warn( 'Failed to get pilot status:' )
            self.log.warn( '%s:%s @ %s' % ( ownerDN, ownerGroup, gridType ) )
            continue

          statusDict = result[ 'Value' ]
          for pRef in statusDict:
            pDict = statusDict[ pRef ]
            if pDict:
              if pDict['isParent']:
                self.log.verbose( 'Clear parametric parent %s' % pRef )
                result = self.clearParentJob( pRef, pDict, connection )
                if not result['OK']:
                  self.log.warn( result['Message'] )
                else:
                  self.log.info( 'Parametric parent removed: %s' % pRef )
              if pDict[ 'FinalStatus' ]:
                self.log.verbose( 'Marking Status for %s to %s' % ( pRef, pDict['Status'] ) )
                pilotsToAccount[ pRef ] = pDict
              else:
                self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
                result = self.pilotDB.setPilotStatus( pRef,
                                                      pDict['Status'],
                                                      pDict['DestinationSite'],
                                                      updateTime = pDict['StatusDate'],
                                                      conn = connection )

          if len( pilotsToAccount ) > 100:
            self.accountPilots( pilotsToAccount, connection )
            pilotsToAccount = {}

    self.accountPilots( pilotsToAccount, connection )
    # Now handle pilots not updated in the last N days (most likely the Broker is no 
    # longer available) and declare them Deleted.
    result = self.handleOldPilots( connection )

    connection.close()

    return S_OK()
Пример #20
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # This proxy is used for checking the pilot status and renewals
      # We really need at least a few hours otherwise the renewed
      # proxy may expire before we check again...
      result = ce.isProxyValid( 3*3600 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'OutputReady':'False',
                                            'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'AccountingSent':'False',
                                              'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Пример #21
0
    def createVMs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        images = self.imageDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['VO'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for image in images:
            if 'Tags' in self.imageDict[image]['ParametersDict']:
                tags += self.imageDict[image]['ParametersDict']['Tags']
        tqDict['Tag'] = list(set(tags))
        tqDict['SubmitPool'] = "mpdPool"

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        images = self.imageDict.keys()
        random.shuffle(images)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for image in images:

            # Check if the image failed previously
            #failedCount = self.failedImages[ image ] % self.failedImageCycleFactor
            #if failedCount != 0:
            #  self.log.warn( "%s queue failed recently, skipping %d cycles" % ( image, 10-failedCount ) )
            #  self.failedImages[image] += 1
            #  continue

            #print "AT >>> image parameters:", image
            #for key,value in self.imageDict[image].items():
            #  print key,value

            ce = self.imageDict[image]['CE']
            ceName = self.imageDict[image]['CEName']
            imageName = self.imageDict[image]['ImageName']
            siteName = self.imageDict[image]['Site']
            platform = self.imageDict[image]['Platform']
            imageTags = self.imageDict[image]['ParametersDict'].get('Tags', [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.imageDict[image]['MaxInstances'])
            processorTags = []

            for tag in imageTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            # vms support WholeNode naturally
            processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (imageName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (imageName, siteName))
                continue

            if 'CPUTime' in self.imageDict[image]['ParametersDict']:
                imageCPUTime = int(
                    self.imageDict[image]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % image)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = processorTags

            # Get the number of eligible jobs for the target site/queue

            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % image)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), image))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, image))

            # Get the working proxy
            self.log.verbose("Getting cloud proxy for %s/%s" %
                             (self.cloudDN, self.cloudGroup))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.cloudDN, self.cloudGroup, 3600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            #ce.setProxy( self.proxy, cpuTime - 60 )

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % image)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info( '%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' % \
                                    ( image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit ) )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, image))
            result = ce.createInstances(vmsToSubmit)
            #result = S_OK()
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % image,
                               result['Message'])
                self.failedImages.setdefault(image, 0)
                self.failedImages[image] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), imageName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                endpoint = '%s::%s' % (self.imageDict[image]['Site'], ceName)
                result = virtualMachineDB.insertInstance(
                    uuID, imageName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    continue
                for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, '', '', self.localhost, 'Cloud',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB')

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Пример #22
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        queues = self.queueDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tags']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            queueTags = self.queueDict[queue]['ParametersDict']['Tags']
            siteMask = siteName in siteMaskList
            processorTags = []

            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = processorTags
            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = taskQueueDict.keys()
            tqIDListByProcessors = {}
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors.keys():

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    lastUpdateTime = dateTime(
                    ) - self.pilotWaitingTime * second
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None, lastUpdateTime)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                               ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)
                    httpProxy = self.queueDict[queue]['ParametersDict'].get(
                        'HttpProxy', '')

                    result = self.getExecutable(queue, pilotsToSubmit,
                                                bundleProxy, httpProxy,
                                                jobExecDir, processors)
                    if not result['OK']:
                        return result

                    executable, pilotSubmissionChunk = result['Value']
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    if result.has_key('PilotStampDict'):
                        stampDict = result['PilotStampDict']
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    rndm = random.random() * sumPriority
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        if not tqDict.has_key(tqID):
                            tqDict[tqID] = []
                        tqDict[tqID].append(pilotID)

                    for tqID, pilotList in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            pilotList, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, '', stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in pilotList:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])
                                continue

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Пример #23
0
    def execute(self):
        """The PilotAgent execution method.
    """

        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance,
                        '')
        result = self.pilotDB._getConnection()
        if result['OK']:
            connection = result['Value']
        else:
            return result

        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result['Value']:

            if not gridType in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' %
                             (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {
                'Status': 'Done',
                'StatusReason': 'Report from JobAgent',
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            condDict2 = {
                'Status': self.queryStateList,
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromDIRACGroup(
                    ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error(
                        'Could not get proxy:',
                        'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" %
                    (len(refList), ownerDN, ownerGroup))

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index:start_index +
                                          MAX_JOBS_QUERY]
                    self.log.verbose(
                        'Querying %d pilots of %s starting at %d' %
                        (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' %
                                      (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict['isParent']:
                                self.log.verbose('Clear parametric parent %s' %
                                                 pRef)
                                result = self.clearParentJob(
                                    pRef, pDict, connection)
                                if not result['OK']:
                                    self.log.warn(result['Message'])
                                else:
                                    self.log.info(
                                        'Parametric parent removed: %s' % pRef)
                            if pDict['FinalStatus']:
                                self.log.verbose(
                                    'Marking Status for %s to %s' %
                                    (pRef, pDict['Status']))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose(
                                    'Setting Status for %s to %s' %
                                    (pRef, pDict['Status']))
                                result = self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict['Status'],
                                    pDict['DestinationSite'],
                                    updateTime=pDict['StatusDate'],
                                    conn=connection)

                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)

        connection.close()

        return S_OK()
Пример #24
0
    def execute(self):
        """The PilotAgent execution method.
    """

        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")
        result = self.pilotDB._getConnection()
        if result["OK"]:
            connection = result["Value"]
        else:
            return result

        result = self.pilotDB.getPilotGroups(self.identityFieldsList, {"Status": self.queryStateList})
        if not result["OK"]:
            self.log.error("Fail to get identities Groups", result["Message"])
            return result
        if not result["Value"]:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result["Value"]:

            if not gridType in self.eligibleGridTypes:
                continue

            self.log.verbose("Getting pilots for %s:%s @ %s %s" % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {
                "Status": "Done",
                "StatusReason": "Report from JobAgent",
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            condDict2 = {
                "Status": self.queryStateList,
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to clear Waiting Pilot Jobs")

                result = self.pilotDB.selectPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to get the Pilot Agents")
                    return result
                if not result["Value"]:
                    continue
                refList = result["Value"]

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret["OK"]:
                    self.log.error(ret["Message"])
                    self.log.error("Could not get proxy:", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret["Value"]

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" % (len(refList), ownerDN, ownerGroup)
                )

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index : start_index + MAX_JOBS_QUERY]
                    self.log.verbose(
                        "Querying %d pilots of %s starting at %d" % (len(refsToQuery), len(refList), start_index)
                    )
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result["OK"]:
                        if result["Message"] == "Broker not Available":
                            self.log.error("Broker %s not Available" % broker)
                            break
                        self.log.warn("Failed to get pilot status:")
                        self.log.warn("%s:%s @ %s" % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result["Value"]
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict["isParent"]:
                                self.log.verbose("Clear parametric parent %s" % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result["OK"]:
                                    self.log.warn(result["Message"])
                                else:
                                    self.log.info("Parametric parent removed: %s" % pRef)
                            if pDict["FinalStatus"]:
                                self.log.verbose("Marking Status for %s to %s" % (pRef, pDict["Status"]))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                                result = self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict["Status"],
                                    pDict["DestinationSite"],
                                    updateTime=pDict["StatusDate"],
                                    conn=connection,
                                )

                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)

        connection.close()

        return S_OK()
Пример #25
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = siteName in siteMaskList

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        return S_ERROR( 'CPU time limit is not specified for queue %s' % queue )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.genericPilotDN, self.genericPilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: %s' % ( queue, result['Message'] ) )
        continue

      totalSlots = result['Value']

      self.log.verbose( result['Message'] )

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]

      result = taskQueueDB.getMatchingTaskQueues( ceDict )

      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      pilotsToSubmit = min( totalSlots, totalTQJobs )
      self.log.verbose( 'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' % ( totalSlots, totalTQJobs, pilotsToSubmit ) )

      if pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy )
        if not result['OK']:
          return result

        # If proxy is not bundled in, submit with the user proxy 

        executable = result['Executable']
        proxy = result['Proxy']
        result = ce.submitJob( executable, proxy, pilotsToSubmit )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:' % queue, result['Message'] )

        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random()*sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                     tqID,
                                                     self.genericPilotDN,
                                                     self.genericPilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: %s' % result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfuly submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: %s' % result['Message'] )
              continue

    return S_OK()
Пример #26
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1 

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Пример #27
0
  def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary
    """

    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    if self.checkPlatform:
      result = self.resourcesModule.getCompatiblePlatforms(self.platforms)
      if not result['OK']:
        return result
      tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
      tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result['OK']:
      return result
    if not result['Value']:
      self.log.verbose('No Waiting jobs suitable for the director')
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add(site)
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add(site)
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
      tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
      return result
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      queueTags = self.queueDict[queue]['ParametersDict']['Tag']
      siteMask = siteName in siteMaskList
      processorTags = []

      # Check the status of the Site
      result = self.siteClient.getUsableSites(siteName)
      if not result['OK']:
        self.log.error("Can not get the status of site %s: %s" %
                       (siteName, result['Message']))
        continue
      if siteName not in result.get('Value', []):
        self.log.info("site %s is not active" % siteName)
        continue

      if self.rssFlag:
        # Check the status of the ComputingElement
        result = self.rssClient.getElementStatus(ceName, "ComputingElement")
        if not result['OK']:
          self.log.error("Can not get the status of computing element",
                         " %s: %s" % (siteName, result['Message']))
          continue
        if result['Value']:
          # get the value of the status
          result = result['Value'][ceName]['all']

        if result not in ('Active', 'Degraded'):
          self.log.verbose(
              "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
          continue

      for tag in queueTags:
        if re.match(r'^[0-9]+Processors$', tag):
          processorTags.append(tag)
      if 'WholeNode' in queueTags:
        processorTags.append('WholeNode')

      if not anySite and siteName not in jobSites:
        self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
      else:
        self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict['GridCE'] = ceName
      # if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      if self.checkPlatform:
        platform = self.queueDict[queue]['Platform']
        result = self.resourcesModule.getCompatiblePlatforms(platform)
        if not result['OK']:
          continue
        ceDict['Platform'] = result['Value']

      ceDict['Tag'] = queueTags
      # Get the number of eligible jobs for the target site/queue
      result = matcherClient.getMatchingTaskQueues(ceDict)
      if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose('No matching TQs found for %s' % queue)
        continue

      matchedQueues += 1
      totalTQJobs = 0
      totalTQJobsByProcessors = {}
      tqIDList = taskQueueDict.keys()
      tqIDListByProcessors = {}
      for tq in taskQueueDict:
        if 'Tags' not in taskQueueDict[tq]:
          # skip non multiprocessor tqs
          continue
        for tag in taskQueueDict[tq]['Tags']:
          if tag in processorTags:
            tqIDListByProcessors.setdefault(tag, [])
            tqIDListByProcessors[tag].append(tq)

            totalTQJobsByProcessors.setdefault(tag, 0)
            totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs,
                                                                                      len(tqIDList), queue))

      queueSubmittedPilots = 0
      for tag in tqIDListByProcessors:

        self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

        processors = 1

        m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
        if m:
          processors = int(m.group('processors'))
        if tag == 'WholeNode':
          processors = -1

        tagTQJobs = totalTQJobsByProcessors[tag]
        tagTqIDList = tqIDListByProcessors[tag]

        # Get the number of already waiting pilots for these task queues
        tagWaitingPilots = 0
        if self.pilotWaitingFlag:
          result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                              'Status': WAITING_PILOT_STATUS},
                                             None)
          if not result['OK']:
            self.log.error('Failed to get Number of Waiting pilots', result['Message'])
            tagWaitingPilots = 0
          else:
            tagWaitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
        if tagWaitingPilots >= tagTQJobs:
          self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
          continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots,
                                                                                         tagTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.getQueueSlots(queue, False)
        if totalSlots == 0:
          self.log.debug('%s: No slots available' % queue)
          continue

        # Note: comparing slots to job numbers is not accurate in multiprocessor case.
        #       This could lead to over submission.
        pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

        while pilotsToSubmit > 0:
          self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

          bundleProxy = self.queueDict[queue].get('BundleProxy', False)
          jobExecDir = ''
          jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

          executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                bundleProxy=bundleProxy,
                                                                jobExecDir=jobExecDir,
                                                                processors=processors)
          result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
          # ## FIXME: The condor thing only transfers the file with some
          # ## delay, so when we unlink here the script is gone
          # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
          if ceType != 'HTCondorCE':
            os.unlink(executable)
          if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
            pilotsToSubmit = 0
            self.failedQueues[queue] += 1
            continue

          pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
          queueSubmittedPilots += pilotSubmissionChunk
          # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
          # task queue priorities
          pilotList = result['Value']
          self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
          totalSubmittedPilots += len(pilotList)
          self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
          stampDict = {}
          if 'PilotStampDict' in result:
            stampDict = result['PilotStampDict']
          tqPriorityList = []
          sumPriority = 0.
          for tq in tagTqIDList:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
          rndm = random.random() * sumPriority
          tqDict = {}
          for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
              if rndm < prio:
                tqID = tq
                break
            if tqID not in tqDict:
              tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

          for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList,
                                                       tqID,
                                                       self.pilotDN,
                                                       self.pilotGroup,
                                                       self.localhost,
                                                       ceType,
                                                       stampDict)
            if not result['OK']:
              self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
              continue
            for pilot in pilotList:
              result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                    'Successfully submitted by the SiteDirector',
                                                    siteName, queueName)
              if not result['OK']:
                self.log.error('Failed to set pilot status: ', result['Message'])
                continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()