示例#1
0
 def _updateSiteMask( self, sitesData ):
   """ Flag every site of sitesData as 'Allowed' or 'Banned'.

       :param sitesData: mapping of site name to a per-site dictionary; the
                         'siteMaskStatus' key of each per-site dictionary is set
       :return: S_OK( sitesData ) carrying the updated mapping
   """
   statusClient = SiteStatus()
   # dict() makes a shallow copy: the loop walks the copy, while the per-site
   # dictionaries are still the very same objects held by sitesData.
   sitesSnapshot = dict( sitesData )
   for siteName in sitesSnapshot:
     siteRecord = sitesSnapshot[ siteName ]
     # FIXME: we are only taking into account ComputingAccess
     if statusClient.isUsableSite( siteName, 'ComputingAccess' ):
       siteRecord[ 'siteMaskStatus' ] = 'Allowed'
     else:
       siteRecord[ 'siteMaskStatus' ] = 'Banned'
     sitesData[ siteName ][ 'siteMaskStatus' ] = siteRecord[ 'siteMaskStatus' ]
   return S_OK( sitesData )
示例#2
0
 def _updateSiteMask(self, sitesData):
     """ Set the 'siteMaskStatus' entry of every site in sitesData.

     A site gets 'Allowed' when SiteStatus reports it usable for
     'ComputingAccess' and 'Banned' otherwise.

     :param sitesData: mapping of site name to a per-site dictionary
     :return: S_OK(sitesData) with the updated mapping
     """
     checker = SiteStatus()
     # Iterate over a shallow copy of the mapping; the nested per-site
     # dictionaries remain shared with sitesData.
     maskView = dict(sitesData)
     for name in maskView:
         # FIXME: we are only taking into account ComputingAccess
         usable = checker.isUsableSite(name, 'ComputingAccess')
         maskView[name]['siteMaskStatus'] = 'Allowed' if usable else 'Banned'
         sitesData[name]['siteMaskStatus'] = maskView[name]['siteMaskStatus']
     return S_OK(sitesData)
示例#3
0
class SiteDirector(AgentModule):
    """
      Agent that submits pilots to the queues of the computing elements it
      serves, according to the waiting task queues.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the agent, this one is usually used
                 for the agent restart
  """
    def initialize(self):
        """ Apply the initial settings of the agent.

        Sets the default polling time and pilot waiting hours options and
        creates the attributes used by the other methods.

        :return: S_OK()
        """
        self.am_setOption("PollingTime", 60.0)
        self.am_setOption("maxPilotWaitingHours", 6)

        # Submission limits start from the module defaults
        self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE

        # Queue descriptions are filled in by getQueues()
        self.queueDict = {}
        self.siteStatus = SiteStatus()
        return S_OK()

    def beginExecution(self):
        """ Read the agent configuration and build the dictionary of served queues.

        Resolves the community and user groups, the pilot credentials, the pilot
        submission parameters and flags, and the site / CE selection; then asks
        the Resources helper for the eligible queues, passes them to getQueues()
        and logs the resulting settings.

        :return: S_OK() on success, otherwise the failed result structure of the
                 call that could not be completed
        """

        self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

        # The SiteDirector is for a particular user community
        self.vo = self.am_getOption("Community", '') or CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", '')

        # self.voGroups contains all the eligible user groups for pilots
        # submitted by this SiteDirector.
        # Choosing the group for which pilots will be submitted is a hack until
        # we will be able to match pilots to VOs.
        self.voGroups = []
        if self.group:
            self.voGroups = [self.group]
        elif self.vo:
            groupsResult = Registry.getGroupsForVO(self.vo)
            if not groupsResult['OK']:
                return groupsResult
            for voGroup in groupsResult['Value']:
                if 'NormalUser' in Registry.getPropertiesForGroup(voGroup):
                    self.voGroups.append(voGroup)

        # Generic pilot credentials, which the agent options may override
        credentials = findGenericPilotCredentials(vo=self.vo)
        if not credentials['OK']:
            return credentials
        genericDN, genericGroup = credentials['Value']
        self.pilotDN = self.am_getOption("PilotDN", genericDN)
        self.pilotGroup = self.am_getOption("PilotGroup", genericGroup)

        # Both lists are filled in by getQueues()
        self.platforms = []
        self.sites = []

        # The group setting takes precedence over the community one
        if self.group:
            self.defaultSubmitPools = Registry.getGroupOption(
                self.group, 'SubmitPools', '')
        elif self.vo:
            self.defaultSubmitPools = Registry.getVOOption(
                self.vo, 'SubmitPools', '')
        else:
            self.defaultSubmitPools = ''

        # Pilot submission parameters
        self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT)
        self.install = DIRAC_INSTALL
        self.workingDirectory = self.am_getOption('WorkDirectory')
        self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3)
        self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO')
        self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode',
                                                   self.maxJobsInFillMode)
        self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit',
                                                   self.maxPilotsToSubmit)
        self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True)
        self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 7200)

        # Flags
        self.updateStatus = self.am_getOption('UpdatePilotStatus', True)
        self.getOutput = self.am_getOption('GetPilotOutput', True)
        self.sendAccounting = self.am_getOption('SendPilotAccounting', True)

        # Get the site description dictionary: an option left at 'Any'
        # (case-insensitive) means "no restriction" and is passed on as None
        selection = {}
        for optionName in ('Site', 'CETypes', 'CEs'):
            selection[optionName] = None
            if self.am_getOption(optionName, 'Any').lower() != "any":
                selection[optionName] = self.am_getOption(optionName, [])
        siteNames = selection['Site']
        ceTypes = selection['CETypes']
        ces = selection['CEs']

        self._resources = Resources.Resources(vo=self.vo)
        queuesInfo = self._resources.getEligibleQueuesInfo(siteList=siteNames,
                                                           ceList=ces,
                                                           ceTypeList=ceTypes,
                                                           mode='Direct')
        if not queuesInfo['OK']:
            return queuesInfo
        queuesResult = self.getQueues(queuesInfo['Value'])
        if not queuesResult['OK']:
            return queuesResult

        # Report the enabled flags
        enabledFlags = ((self.updateStatus, 'Pilot status update requested'),
                        (self.getOutput, 'Pilot output retrieval requested'),
                        (self.sendAccounting,
                         'Pilot accounting sending requested'))
        for enabled, message in enabledFlags:
            if enabled:
                self.log.always(message)

        # Report the effective settings
        settings = (('Sites:', siteNames),
                    ('CETypes:', ceTypes),
                    ('CEs:', ces),
                    ('PilotDN:', self.pilotDN),
                    ('PilotGroup:', self.pilotGroup),
                    ('MaxPilotsToSubmit:', self.maxPilotsToSubmit),
                    ('MaxJobsInFillMode:', self.maxJobsInFillMode))
        for label, value in settings:
            self.log.always(label, value)

        self.localhost = socket.getfqdn()
        self.proxy = ''

        if self.queueDict:
            self.log.always("Agent will serve queues:")
            for queue in self.queueDict:
                served = self.queueDict[queue]
                self.log.always("Site: %s, CE: %s, Queue: %s" %
                                (served['Site'], served['CEName'], queue))

        return S_OK()

    def getQueues(self, resourceDict):
        """ Build self.queueDict with the relevant CEs and their descriptions.

        For every queue of every CE of every site in resourceDict an entry keyed
        '<ce>_<queue>' is created. It holds the queue parameters, the computing
        element object created by the factory and the site / CE / queue names.
        self.platforms and self.sites are extended along the way.

        :param resourceDict: site -> CE -> CE description dictionary; the
                             'Queues' item is popped from each CE description
        :return: S_OK() on success, otherwise the failed result of the CE
                 creation or of the CE validity check
        """

        self.queueDict = {}
        ceFactory = ComputingElementFactory()

        for site in resourceDict:
            nameResult = self._resources.getSiteFullName(site)
            if not nameResult['OK']:
                # Sites whose full name cannot be resolved are ignored
                continue
            siteFullName = nameResult['Value']

            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                qDict = ceDict.pop('Queues')

                for queue in qDict:
                    queueName = '%s_%s' % (ce, queue)
                    # params is the very dictionary stored in the queue entry,
                    # so every update below is visible through self.queueDict
                    params = qDict[queue]
                    entry = {'ParametersDict': params}
                    self.queueDict[queueName] = entry

                    params['Queue'] = queue
                    params['Site'] = siteFullName
                    params['GridEnv'] = self.gridEnv
                    params['Setup'] = gConfig.getValue('/DIRAC/Setup',
                                                       'unknown')

                    # Evaluate the CPU limit of the queue according to the Glue convention
                    # To Do: should be a utility
                    if "maxCPUTime" in params and "SI00" in params:
                        # For some sites there are crazy values in the CS,
                        # hence the clamping to [0, 12.5 days]
                        maxCPUTime = min(max(float(params['maxCPUTime']), 0),
                                         86400 * 12.5)
                        si00 = float(params['SI00'])
                        queueCPUTime = 60. / 250. * maxCPUTime * si00
                        params['CPUTime'] = int(queueCPUTime)

                    # One working directory per queue
                    qwDir = os.path.join(self.workingDirectory, queue)
                    if not os.path.exists(qwDir):
                        os.makedirs(qwDir)
                    params['WorkingDirectory'] = qwDir

                    # Platform: queue setting first, then the CE one, then
                    # a value composed from the CE architecture and OS
                    platform = ''
                    if "Platform" in params:
                        platform = params['Platform']
                    elif "Platform" in ceDict:
                        platform = ceDict['Platform']
                    elif "OS" in ceDict:
                        platform = '_'.join(
                            [ceDict.get('architecture', 'x86_64'),
                             ceDict['OS']])
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if platform and "Platform" not in params:
                        platformResult = Resources.getDIRACPlatform(platform)
                        if platformResult['OK']:
                            params['Platform'] = platformResult['Value']

                    # Queue parameters override the CE level ones
                    ceQueueDict = dict(ceDict)
                    ceQueueDict.update(params)
                    ceResult = ceFactory.getCE(ceName=ce,
                                               ceType=ceDict['CEType'],
                                               ceParametersDict=ceQueueDict)
                    if not ceResult['OK']:
                        return ceResult
                    entry.update({
                        'CE': ceResult['Value'],
                        'CEName': ce,
                        'CEType': ceDict['CEType'],
                        'Site': siteFullName,
                        'QueueName': queue,
                        'Platform': platform,
                    })

                    validity = entry['CE'].isValid()
                    if not validity['OK']:
                        self.log.fatal(validity['Message'])
                        return validity

                    # The mere presence of the option enables proxy bundling
                    if 'BundleProxy' in params or 'BundleProxy' in ceDict:
                        entry['BundleProxy'] = True

                    if siteFullName not in self.sites:
                        self.sites.append(siteFullName)

        return S_OK()

    def execute(self):
        """ Main execution method: one agent cycle.

        Submits pilots and, when the UpdatePilotStatus flag is set, updates the
        pilot status. Failures of either step are logged and do not fail the
        cycle.

        :return: S_OK() in all cases
        """

        if not self.queueDict:
            self.log.warn('No site defined, exiting the cycle')
            return S_OK()

        submission = self.submitJobs()
        if not submission['OK']:
            self.log.error('Errors in the job submission: ',
                           submission['Message'])

        if not self.updateStatus:
            return S_OK()

        statusUpdate = self.updatePilotStatus()
        if not statusUpdate['OK']:
            self.log.error('Errors in updating pilot status: ',
                           statusUpdate['Message'])

        return S_OK()

    def submitJobs(self):
        """ Go through defined computing elements and submit pilots if necessary.

        The Matcher is first asked whether any task queue matches the overall
        capabilities of the director. Then, for each served queue taken in
        random order: a pilot proxy is obtained and set on the CE, the number of
        free slots and of matching waiting jobs is evaluated, the pilots already
        waiting are discounted, and the remaining number of pilots (capped by
        self.maxPilotsToSubmit) is submitted. Submitted pilots are registered in
        the PilotAgentsDB, spread over the matching task queues with a
        probability proportional to the task queue priorities.

        :return: S_OK() when the cycle went through, otherwise the failed result
                 of the platform lookup, Matcher call, proxy retrieval or
                 executable preparation that aborted it
        """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # list() always yields a real list that random.shuffle can reorder
        queues = list(self.queueDict)
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = self.siteStatus.isUsableSite(siteName,
                                                    'ComputingAccess')
            platform = self.queueDict[queue]['Platform']

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy; it is requested one day longer than the
            # queue CPU time limit
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: \n%s' %
                    (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (ceName, queueName, ceInfoDict['WaitingJobs'],
                   ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'],
                   ceInfoDict['MaxTotalJobs']))

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            tqIDList = list(taskQueueDict)
            totalTQJobs = sum(taskQueueDict[tq]['Jobs']
                              for tq in taskQueueDict)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    # Best effort: carry on as if no pilot was waiting
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d'
                % (totalSlots, totalTQJobs, totalWaitingPilots,
                   pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                try:
                    result = ce.submitJob(executable, '',
                                          pilotSubmissionChunk)
                finally:
                    # The pilot script is a temporary file: remove it even if
                    # the submission call raises
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    # Give up on this queue for the current cycle
                    break

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = result.get('PilotStampDict', {})

                # Cumulative priorities: a uniform draw in [0, sumPriority)
                # falls into the interval of a task queue with a probability
                # proportional to its priority
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsPerTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fall back to the last task queue when no interval matches
                    # (e.g. all priorities are zero), so that tqID is always
                    # defined and never left over from the previous pilot
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    pilotsPerTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsPerTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        tqPilots, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

        return S_OK()

#####################################################################################

    def __getExecutable(self,
                        queue,
                        pilotsToSubmit,
                        bundleProxy=True,
                        httpProxy='',
                        jobExecDir=''):
        """ Prepare the full executable for queue.

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: number of pilots that are requested
        :param bundleProxy: when true, self.proxy is passed on to be bundled
                            into the pilot script
        :param httpProxy: value passed on to the pilot script writer
        :param jobExecDir: value passed on to the pilot script writer
        :return: S_OK([executable, pilotsToSubmit]) where pilotsToSubmit is the
                 number returned by the pilot options preparation; a failed
                 result if the pilot options or the pilot script could not be
                 prepared
        """

        proxy = None
        if bundleProxy:
            proxy = self.proxy
        pilotOptions, pilotsToSubmit = self.__getPilotOptions(
            queue, pilotsToSubmit)
        if pilotOptions is None:
            return S_ERROR('Errors in compiling pilot options')
        executable = self.__writePilotScript(self.workingDirectory,
                                             pilotOptions, proxy, httpProxy,
                                             jobExecDir)
        # __writePilotScript returns an S_ERROR structure when the compression
        # of the proxy or of the scripts fails: propagate it instead of
        # wrapping it in S_OK as if it were the executable
        if isinstance(executable, dict) and not executable.get('OK', True):
            return executable
        return S_OK([executable, pilotsToSubmit])

#####################################################################################

    def __getPilotOptions(self, queue, pilotsToSubmit):
        """ Prepare pilot options.

        Builds the list of command line options of the pilot script from the
        configuration, the queue parameters and a proxy token requested for
        pilotsToSubmit * self.maxJobsInFillMode uses.

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: number of pilots that are requested
        :return: [pilotOptions, pilotsToSubmit] where pilotsToSubmit may have
                 been changed according to the number of uses granted with the
                 proxy token; [None, None] when the options cannot be compiled
        """

        queueDict = self.queueDict[queue]['ParametersDict']
        pilotOptions = []

        setup = gConfig.getValue("/DIRAC/Setup", "unknown")
        if setup == 'unknown':
            self.log.error('Setup is not defined in the configuration')
            return [None, None]
        pilotOptions.append('-S %s' % setup)
        opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

        # Installation defined?
        installationName = opsHelper.getValue("Pilot/Installation", "")
        if installationName:
            pilotOptions.append('-V %s' % installationName)

        # Project defined?
        projectName = opsHelper.getValue("Pilot/Project", "")
        if projectName:
            pilotOptions.append('-l %s' % projectName)
        else:
            self.log.info('DIRAC project will be installed by pilots')

        # Request a release
        diracVersion = opsHelper.getValue("Pilot/Version", [])
        if not diracVersion:
            self.log.error('Pilot/Version is not defined in the configuration')
            return [None, None]
        # diracVersion is a list of accepted releases. Just take the first one
        pilotOptions.append('-r %s' % diracVersion[0])

        ownerDN = self.pilotDN
        ownerGroup = self.pilotGroup
        # Request token for maximum pilot efficiency
        result = gProxyManager.requestToken(
            ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
        if not result['OK']:
            self.log.error('Invalid proxy token request', result['Message'])
            return [None, None]
        (token, numberOfUses) = result['Value']
        pilotOptions.append('-o /Security/ProxyToken=%s' % token)

        # Each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
        # jobs. A value that is not positive cannot be used: it would also make
        # the division below fail.
        jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
        if jobsPerPilot <= 0:
            self.log.error('Invalid number of jobs per pilot: %s' %
                           jobsPerPilot)
            return [None, None]
        # Use Filling mode
        pilotOptions.append('-M %s' % jobsPerPilot)

        # With numberOfUses tokens we can submit at most:
        #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
        # pilots. Floor division keeps the number of pilots an integer.
        newPilotsToSubmit = numberOfUses // jobsPerPilot
        if newPilotsToSubmit != pilotsToSubmit:
            self.log.info(
                'Number of pilots to submit is changed to %d after getting the proxy token'
                % newPilotsToSubmit)
            pilotsToSubmit = newPilotsToSubmit
        # Debug
        if self.pilotLogLevel.lower() == 'debug':
            pilotOptions.append('-d')
        # CS Servers
        csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
        pilotOptions.append('-C %s' % ",".join(csServers))

        # DIRAC Extensions to be used in pilots: the Pilot/Extensions setting
        # wins, a first element 'None' meaning no extension at all; without
        # the setting the extensions from CSGlobals are used
        pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
        extensionsList = []
        if pilotExtensionsList:
            if pilotExtensionsList[0] != 'None':
                extensionsList = pilotExtensionsList
        else:
            extensionsList = CSGlobals.getCSExtensions()
        if extensionsList:
            pilotOptions.append('-e %s' % ",".join(extensionsList))

        # Requested CPU time
        pilotOptions.append('-T %s' % queueDict['CPUTime'])
        # CEName
        pilotOptions.append('-N %s' % self.queueDict[queue]['CEName'])
        # SiteName
        pilotOptions.append('-n %s' % queueDict['Site'])
        if 'ClientPlatform' in queueDict:
            pilotOptions.append("-p '%s'" % queueDict['ClientPlatform'])

        if 'SharedArea' in queueDict:
            pilotOptions.append("-o '/LocalSite/SharedArea=%s'" %
                                queueDict['SharedArea'])

        # The CPU factors are derived from SI00 when it is known, otherwise
        # the explicit queue settings are passed on
        if 'SI00' in queueDict:
            factor = float(queueDict['SI00']) / 250.
            pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor)
            pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" %
                                factor)
        else:
            if 'CPUScalingFactor' in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" %
                                    queueDict['CPUScalingFactor'])
            if 'CPUNormalizationFactor' in queueDict:
                pilotOptions.append(
                    "-o '/LocalSite/CPUNormalizationFactor=%s'" %
                    queueDict['CPUNormalizationFactor'])

        # Hack
        if self.defaultSubmitPools:
            pilotOptions.append(
                '-o /Resources/Computing/CEDefaults/SubmitPool=%s' %
                self.defaultSubmitPools)

        if self.group:
            pilotOptions.append('-G %s' % self.group)

        self.log.verbose("pilotOptions: ", ' '.join(pilotOptions))

        return [pilotOptions, pilotsToSubmit]


#####################################################################################

    def __writePilotScript(self,
                           workingDirectory,
                           pilotOptions,
                           proxy=None,
                           httpProxy='',
                           pilotExecDir=''):
        """ Bundle together and write out the pilot executable script, admixt the proxy if given
    """

        try:
            compressedAndEncodedProxy = ''
            proxyFlag = 'False'
            if proxy is not None:
                compressedAndEncodedProxy = base64.encodestring(
                    bz2.compress(proxy.dumpAllToString()['Value']))
                proxyFlag = 'True'
            compressedAndEncodedPilot = base64.encodestring(
                bz2.compress(open(self.pilot, "rb").read(), 9))
            compressedAndEncodedInstall = base64.encodestring(
                bz2.compress(open(self.install, "rb").read(), 9))
        except:
            self.log.exception(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )
            return S_ERROR(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )

        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory )
  os.chdir( pilotWorkingDirectory )
  if %(proxyFlag)s:
    open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) )
    os.chmod("proxy",0600)
    os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) )
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % {
            'compressedAndEncodedProxy': compressedAndEncodedProxy,
            'compressedAndEncodedPilot': compressedAndEncodedPilot,
            'compressedAndEncodedInstall': compressedAndEncodedInstall,
            'httpProxy': httpProxy,
            'pilotExecDir': pilotExecDir,
            'pilotScript': os.path.basename(self.pilot),
            'installScript': os.path.basename(self.install),
            'pilotOptions': ' '.join(pilotOptions),
            'proxyFlag': proxyFlag
        }

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py',
                                    prefix='DIRAC_',
                                    dir=workingDirectory)
        pilotWrapper = os.fdopen(fd, 'w')
        pilotWrapper.write(localPilot)
        pilotWrapper.close()
        return name

    def updatePilotStatus(self):
        """ Update status of pilots in transient states

        Two passes are made over all the queues in self.queueDict:

        1. Pilots of this agent ( self.pilotDN / self.pilotGroup ) which are in
           one of the TRANSIENT_PILOT_STATUS states are compared with the
           status reported by the CE and updated in the pilot DB when it
           changed. The output of pilots reaching a final state is retrieved
           if self.getOutput is set.
        2. Pilots already in a final state but with OutputReady 'False' get
           their output retrieved ( if self.getOutput is set ), and, if
           self.sendAccounting is set, accounting records are sent for the
           final pilots with AccountingSent 'False'.

        Failures for a single queue are logged and the queue is skipped.

        :return: S_OK() or, if a pilot proxy can not be obtained, the failed
                 result of gProxyManager.getPilotProxyFromDIRACGroup()
        """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'Status': TRANSIENT_PILOT_STATUS,
                'OwnerDN': self.pilotDN,
                'OwnerGroup': self.pilotGroup
            })
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' %
                               result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']

            # The CE is queried with "<reference>:::<stamp>" identifiers; as
            # soon as one pilot has no stamp the bare references are used for
            # all of them
            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" +
                                            pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            # Renew the proxy held by the CE if it is not valid any more
            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE',
                               '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info('Updating status to %s for pilot %s' %
                                  (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(
                        pRef, newStatus, '', 'Updated by SiteDirector')
                    if not result['OK']:
                        # Do not hide a failed DB update, it will be retried
                        # in the next cycle since the status is unchanged
                        self.log.error('Failed to update pilot status',
                                       '%s: %s' % (pRef, result['Message']))
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower(
                    ) == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error(
                                'Failed to get pilot output',
                                '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(
                                    pRef, output, error)
                                if not result['OK']:
                                    self.log.error(
                                        'Failed to store pilot output',
                                        result['Message'])
                            else:
                                self.log.warn(
                                    'Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            # isProxyValid() returns a result structure ( see the first loop ),
            # so its 'OK' flag has to be inspected: the structure itself is
            # always true and the proxy would never be renewed
            result = ce.isProxyValid(120)
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                # Keep the freshly obtained proxy, otherwise a stale one
                # would be set on the CE
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'OutputReady': 'False',
                'Status': FINAL_PILOT_STATUS
            })

            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(
                            pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output',
                                           result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({
                    'DestinationSite': ceName,
                    'Queue': queueName,
                    'GridType': ceType,
                    'GridSite': siteName,
                    'AccountingSent': 'False',
                    'Status': FINAL_PILOT_STATUS
                })

                if not result['OK']:
                    self.log.error('Failed to select pilots',
                                   result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB',
                                   result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()

    def sendPilotAccounting(self, pilotDict):
        """ Send pilot accounting record

        One PilotAccounting record is built and registered with
        gDataStoreClient for each pilot, then all the records are committed
        in one go. Only the pilots whose record was successfully registered
        are flagged as AccountingSent in the pilot DB.

        :param pilotDict: dictionary { pilotReference : pilotInfo }; the info
                          keys read here are LastUpdateTime, SubmissionTime,
                          OwnerDN, OwnerGroup, DestinationSite, GridType,
                          Broker, Status, PilotID and, optionally, Jobs

        :return: S_OK() or the failed result of gDataStoreClient.commit()
        """
        # References of the pilots with a record accepted by the data store
        # client, the only ones which may be flagged after the commit
        registeredRefs = []

        for pRef in pilotDict:
            pilotInfo = pilotDict[pRef]
            self.log.verbose('Preparing accounting record for pilot %s' % pRef)
            pA = PilotAccounting()
            pA.setEndTime(pilotInfo['LastUpdateTime'])
            pA.setStartTime(pilotInfo['SubmissionTime'])
            retVal = CS.getUsernameForDN(pilotInfo['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:",
                               pilotInfo['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pilotInfo['OwnerGroup'])
            result = getSiteForCE(pilotInfo['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pilotInfo['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pilotInfo['GridType'])
            pA.setValueByKey('GridResourceBroker', pilotInfo['Broker'])
            pA.setValueByKey('GridStatus', pilotInfo['Status'])
            if 'Jobs' not in pilotInfo:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pilotInfo['Jobs']))
            self.log.info("Adding accounting record for pilot %s" %
                          pilotInfo['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                self.log.error('Failed to send accounting info for pilot ',
                               pRef)
                continue

            registeredRefs.append(pRef)
            # Set up AccountingSent flag
            result = pilotAgentsDB.setAccountingFlag(pRef)
            if not result['OK']:
                self.log.error('Failed to set accounting flag for pilot ',
                               pRef)

        self.log.info('Committing accounting records for %d pilots' %
                      len(registeredRefs))
        result = gDataStoreClient.commit()
        if not result['OK']:
            return result

        # Flag again after the commit, which gives a second chance to the
        # pilots for which the DB update above failed. Pilots with no
        # registered record are left out on purpose: flagging them would
        # prevent their accounting from ever being resent.
        for pRef in registeredRefs:
            self.log.verbose('Setting AccountingSent flag for pilot %s' %
                             pRef)
            result = pilotAgentsDB.setAccountingFlag(pRef)
            if not result['OK']:
                self.log.error('Failed to set accounting flag for pilot ',
                               pRef)

        return S_OK()
示例#4
0
class ResourceStatus( ElementStatus ):
  """
  ResourceStatus helper giving access to the status of the resources
  ( computing and storage elements ) as recorded by the ResourceStatus system.
  It keeps the SiteStatus helper and one RSSCache per element type as object
  members, to avoid creating new ones massively.

  The statuses reported by the get<Element>Statuses methods take into account
  the site owning the resource: if the site is not usable for the
  corresponding access type, all the statuses of the resource are reported as
  'Banned'.

  The methods getElementStatus, isUsableElement, getUsableElements,
  getCacheDictFromRawData and the rssClient member are not defined here,
  presumably they are provided by ElementStatus.
  """

  __metaclass__ = DIRACSingleton

  def __init__( self ):
    """
    Constructor, initializes the logger, rssClient and caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super( ResourceStatus, self ).__init__()

    self.siteStatus = SiteStatus()

    # We can set CacheLifetime and CacheHistory from CS, so that we can tune them.
    cacheLifeTime = int( RssConfiguration().getConfigCache() )

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache( 'Storage', cacheLifeTime, self._updateSECache )
    self.ceCache = RSSCache( 'Computing', cacheLifeTime, self._updateCECache )

  #.............................................................................
  # ComputingElement methods

  def getComputingStatuses( self, ceNames, statusTypes = None ):
    """
    Method that queries the RSSCache for ComputingElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).
    If match is positive, the output looks like:
      {
        computingElementA : { statusType1 : status1, statusType2 : status2 },
        computingElementB : { statusType1 : status1, statusType2 : status2 },
      }
    There are ALWAYS the same keys inside the site dictionaries.

    Computing elements whose site can not be resolved, or whose site is not
    usable for 'ComputingAccess', get all their statuses set to 'Banned'.

    examples:
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', None )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', 'all' )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( None, 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' },
                  ... } )

    :Parameters:
      **ceNames** - [ None, `string`, `list` ]
        name(s) of the computing elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.ceCache.match( ceNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__banElementsOfUnusableSites( cacheMatch[ 'Value' ],
                                                    'ComputingAccess' ) )

  def getComputingStatus( self, ceName, statusType ):
    """
    Given a ce and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', 'all' )
          S_OK( 'Active' )
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', None )
          S_ERROR( ... )

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Computing', ceName, statusType )

  def isUsableComputing( self, ceName, statusType ):
    """
    Similar method to getComputingStatus. The difference is the output.
    Given a ce name, returns a bool if the ce is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples:
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          True
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          False # May be banned
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', None )
          False
      >>> resourceStatus.isUsableComputing( 'RubbishCE', 'all' )
          False
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'RubbishAccess' )
          False

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: whatever isUsableElement returns, a bool according to the
             examples above ( to be confirmed against ElementStatus )
    """

    return self.isUsableElement( 'Computing', ceName, statusType )

  def getUsableComputings( self, statusType ):
    """
    For a given statusType, returns all computing elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableComputings( 'all' )
          S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
      >>> resourceStatus.getUsableComputings( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableComputings( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Computing', statusType )

  #.............................................................................
  # StorageElement methods

  def getStorageStatuses( self, seNames, statusTypes = None ):
    """
    Method that queries the RSSCache for StorageElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).
    If match is positive, the output looks like:
    {
      storageElementA : { statusType1 : status1, statusType2 : status2 },
      storageElementB : { statusType1 : status1, statusType2 : status2 },
    }
    There are ALWAYS the same keys inside the site dictionaries.

    Storage elements whose site can not be resolved, or whose site is not
    usable for 'StorageAccess', get all their statuses set to 'Banned'.

    examples:
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', None )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } } )
      >>> resourceStatus.getStorageStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( None, 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER' : { 'ReadAccess' : 'Active' },
                  ... } )

    :Parameters:
      **seNames** - [ None, `string`, `list` ]
        name(s) of the storage elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.seCache.match( seNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__banElementsOfUnusableSites( cacheMatch[ 'Value' ],
                                                    'StorageAccess' ) )

  def getStorageStatus( self, seName, statusType ):
    """
    Given a se and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getStorageStatus( 'CERN-USER', 'ReadAccess' )
          S_OK( 'Active' )
      >>> resourceStatus.getStorageStatus( 'CERN-USER', None )
          S_ERROR( ... )

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Storage', seName, statusType )

  def isUsableStorage( self, seName, statusType ):
    """
    Similar method to getStorageStatus. The difference is the output.
    Given a se name, returns a bool if the se is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples:
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'ReadAccess' )
          True
      >>> resourceStatus.isUsableStorage( 'CERN-ARCHIVE', 'ReadAccess' )
          False # May be banned
      >>> resourceStatus.isUsableStorage( 'CERN-USER', None )
          False
      >>> resourceStatus.isUsableStorage( 'RubbishCE', 'ReadAccess' )
          False
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'RubbishAccess' )
          False

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: whatever isUsableElement returns, a bool according to the
             examples above ( to be confirmed against ElementStatus )
    """

    return self.isUsableElement( 'Storage', seName, statusType )

  def getUsableStorages( self, statusType ):
    """
    For a given statusType, returns all storage elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableStorages( 'ReadAccess' )
          S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
      >>> resourceStatus.getUsableStorages( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableStorages( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Storage', statusType )

  #.............................................................................
  # update Cache methods

  def _updateCECache( self ):
    """
    Method used to update the ComputingElementCache.
    """
    return self.__updateCache( 'Computing' )

  def _updateSECache( self ):
    """
    Method used to update the StorageElementCache.
    """
    return self.__updateCache( 'Storage' )

  #.............................................................................
  # Private methods

  def __updateCache( self, elementType ):
    """
    Method that gets from the rssClient the Name, StatusType and Status of all
    the resources of type <elementType> and converts them into the cache
    dictionary with getCacheDictFromRawData.

    :Parameters:
      **elementType** - `string`
        type of the resources to be selected ( Storage, Computing )

    :return: S_OK( cacheDict ) || S_ERROR()
    """

    meta = { 'columns' : [ 'Name', 'StatusType', 'Status' ] }
    rawCache = self.rssClient.selectStatusElement( 'Resource', 'Status',
                                                    elementType = elementType,
                                                    meta = meta )

    if not rawCache[ 'OK' ]:
      return rawCache
    return S_OK( self.getCacheDictFromRawData( rawCache[ 'Value' ] ) )

  def __banElementsOfUnusableSites( self, cacheMatch, siteAccess ):
    """
    Method that, given the dictionary matched in a cache, sets to 'Banned' all
    the statuses of the elements for which __getSiteAccess does not return OK,
    either because the site is not usable for <siteAccess> or because the site
    could not be found. The dictionary is modified in place and returned.

    :Parameters:
      **cacheMatch** - `dict`
        dictionary { elementName : { statusType : status } }
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: `dict`, the same <cacheMatch> object
    """

    # Iterate over a snapshot, the values are replaced while looping
    for elementName, elementDict in list( cacheMatch.items() ):
      if not self.__getSiteAccess( elementName, siteAccess )[ 'OK' ]:
        cacheMatch[ elementName ] = dict.fromkeys( elementDict, 'Banned' )

    return cacheMatch

  def __getSiteAccess( self, elementName, siteAccess ):
    """
    Method that given an elementName, finds the site name that owns it. Once
    that is done, the site access <siteAccess> is checked and returned.

    :Parameters:
      **elementName** - `string`
        name of the resource
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: S_OK() || S_ERROR()
    """

    siteName = Resources.getSiteForResource( elementName )
    if not siteName[ 'OK' ]:
      return siteName
    siteName = siteName[ 'Value' ]

    if not self.siteStatus.isUsableSite( siteName, siteAccess ):
      # Report the access type actually checked, not always Computing
      return S_ERROR( 'Site %s is not usable for %s' % ( siteName, siteAccess ) )

    return S_OK()
  

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
示例#5
0
class SiteDirector( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def initialize( self ):
    """ Agent initialization: sets the default values of two agent options and
        creates the members used by the rest of the agent

        :return: S_OK()
    """
    # Default values of the agent options
    for optionName, defaultValue in ( ( "PollingTime", 60.0 ),
                                      ( "maxPilotWaitingHours", 6 ) ):
      self.am_setOption( optionName, defaultValue )

    # Queue descriptions, rebuilt by getQueues()
    self.queueDict = {}
    # Submission limits, re-read from the agent options in beginExecution()
    self.maxJobsInFillMode, self.maxPilotsToSubmit = MAX_JOBS_IN_FILLMODE, MAX_PILOTS_TO_SUBMIT
    # Helper used to check the site mask
    self.siteStatus = SiteStatus()
    return S_OK()

  def beginExecution( self ):
    """ Read the agent configuration and build the queue descriptions

        Resolves the community ( VO ), the eligible user groups and the pilot
        credentials, reads the agent options, asks the Resources helper for
        the eligible queues in 'Direct' mode and passes them to getQueues(),
        which fills self.queueDict. The resulting configuration is logged.

        :return: S_OK() || the failed result of the group lookup, of the pilot
                 credentials lookup, of the queue lookup or of getQueues()
    """

    self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() )
    # The SiteDirector is for a particular user community
    self.vo = self.am_getOption( "Community", '' )
    if not self.vo:
      self.vo = CSGlobals.getVO()
    # The SiteDirector is for a particular user group
    self.group = self.am_getOption( "Group", '' )
    # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector
    self.voGroups = []

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if not self.group:
      if self.vo:
        # No explicit group: take every group of the VO having the NormalUser property
        result = Registry.getGroupsForVO( self.vo )
        if not result['OK']:
          return result
        for group in result['Value']:
          if 'NormalUser' in Registry.getPropertiesForGroup( group ):
            self.voGroups.append( group )
    else:
      self.voGroups = [ self.group ]

    # Generic pilot credentials of the VO; the PilotDN and PilotGroup agent
    # options take precedence over them when set
    result = findGenericPilotCredentials( vo = self.vo )
    if not result[ 'OK' ]:
      return result
    self.pilotDN, self.pilotGroup = result[ 'Value' ]
    self.pilotDN = self.am_getOption( "PilotDN", self.pilotDN )
    self.pilotGroup = self.am_getOption( "PilotGroup", self.pilotGroup )

    # self.platforms and self.sites are filled in by getQueues() below
    self.platforms = []
    self.sites = []
    # Submit pools are taken from the group if one is defined, otherwise from the VO
    self.defaultSubmitPools = ''
    if self.group:
      self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' )
    elif self.vo:
      self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' )

    self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT )
    self.install = DIRAC_INSTALL
    self.workingDirectory = self.am_getOption( 'WorkDirectory' )
    # Upper limit applied in submitJobs() to the CPU time of a queue
    self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 )
    self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' )
    self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode )
    self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit )
    # If set, submitJobs() subtracts the already waiting pilots updated within
    # the last MaxPilotWaitingTime from the number of pilots to submit
    self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True )
    self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 7200 )

    # Flags
    self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True )
    self.getOutput = self.am_getOption( 'GetPilotOutput', True )
    self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True )

    # Get the site description dictionary
    # For each of the Site, CETypes and CEs options the value 'Any'
    # ( case-insensitive ) leaves the corresponding filter as None
    siteNames = None
    if not self.am_getOption( 'Site', 'Any' ).lower() == "any":
      siteNames = self.am_getOption( 'Site', [] )
    ceTypes = None
    if not self.am_getOption( 'CETypes', 'Any' ).lower() == "any":
      ceTypes = self.am_getOption( 'CETypes', [] )
    ces = None
    if not self.am_getOption( 'CEs', 'Any' ).lower() == "any":
      ces = self.am_getOption( 'CEs', [] )
      
    self._resources = Resources.Resources( vo = self.vo )  
    result = self._resources.getEligibleQueuesInfo( siteList = siteNames,
                                                    ceList = ces,
                                                    ceTypeList = ceTypes,
                                                    mode = 'Direct' )
    if not result['OK']:
      return result
    resourceDict = result['Value']
    # Fills self.queueDict, self.platforms and self.sites
    result = self.getQueues( resourceDict )
    if not result['OK']:
      return result

    #if not siteNames:
    #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
    #  if siteName == 'Unknown':
    #    return S_OK( 'No site specified for the SiteDirector' )
    #  else:
    #    siteNames = [siteName]
    #self.siteNames = siteNames

    # Report the effective configuration in the log
    if self.updateStatus:
      self.log.always( 'Pilot status update requested' )
    if self.getOutput:
      self.log.always( 'Pilot output retrieval requested' )
    if self.sendAccounting:
      self.log.always( 'Pilot accounting sending requested' )

    self.log.always( 'Sites:', siteNames )
    self.log.always( 'CETypes:', ceTypes )
    self.log.always( 'CEs:', ces )
    self.log.always( 'PilotDN:', self.pilotDN )
    self.log.always( 'PilotGroup:', self.pilotGroup )
    self.log.always( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit )
    self.log.always( 'MaxJobsInFillMode:', self.maxJobsInFillMode )

    # The local host name is recorded with the pilot references in submitJobs()
    self.localhost = socket.getfqdn()
    # The pilot proxy is obtained later, in submitJobs() / updatePilotStatus()
    self.proxy = ''

    if self.queueDict:
      self.log.always( "Agent will serve queues:" )
      for queue in self.queueDict:
        self.log.always( "Site: %s, CE: %s, Queue: %s" % ( self.queueDict[queue]['Site'],
                                                         self.queueDict[queue]['CEName'],
                                                         queue ) )

    return S_OK()

  def getQueues( self, resourceDict ):
    """ Build self.queueDict, the description of the queues to be served, out
        of <resourceDict> ( site -> CE -> CE parameters including 'Queues' )

        Each entry is keyed by '<ce>_<queue>' and holds the queue parameters
        ( 'ParametersDict' ), the CE object and the CE, site, queue and
        platform names. The platforms and the site full names found on the way
        are accumulated in self.platforms and self.sites. Sites whose full name
        can not be resolved are skipped.

        :return: S_OK() || the failed result of the CE creation or validation
    """

    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
      fullNameResult = self._resources.getSiteFullName( site )
      if not fullNameResult['OK']:
        continue
      siteFullName = fullNameResult['Value']
      siteCEs = resourceDict[site]

      for ce in siteCEs:
        ceDict = siteCEs[ce]
        # The queues are taken out of the CE description
        queues = ceDict.pop( 'Queues' )

        for queue in queues:
          queueName = '%s_%s' % ( ce, queue )
          # The parameters dictionary is the one of <resourceDict>, updated in place
          params = queues[queue]
          entry = { 'ParametersDict' : params }
          self.queueDict[queueName] = entry

          params['Queue'] = queue
          params['Site'] = siteFullName
          params['GridEnv'] = self.gridEnv
          params['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' )

          # Evaluate the CPU limit of the queue according to the Glue convention
          # To Do: should be a utility
          if "maxCPUTime" in params and "SI00" in params:
            # For some sites there are crazy values in the CS: clip them to
            # the range from 0 to 12.5 days
            maxCPUTime = min( max( float( params['maxCPUTime'] ), 0 ), 86400 * 12.5 )
            si00 = float( params['SI00'] )
            params['CPUTime'] = int( 60. / 250. * maxCPUTime * si00 )

          # Per queue working directory, created on demand
          qwDir = os.path.join( self.workingDirectory, queue )
          if not os.path.exists( qwDir ):
            os.makedirs( qwDir )
          params['WorkingDirectory'] = qwDir

          # Platform: queue setting first, then the CE one, then architecture + OS
          if "Platform" in params:
            platform = params['Platform']
          elif "Platform" in ceDict:
            platform = ceDict['Platform']
          elif "OS" in ceDict:
            platform = '_'.join( [ ceDict.get( 'architecture', 'x86_64' ), ceDict['OS'] ] )
          else:
            platform = ''
          if platform and platform not in self.platforms:
            self.platforms.append( platform )

          if platform and "Platform" not in params:
            platformResult = Resources.getDIRACPlatform( platform )
            if platformResult['OK']:
              params['Platform'] = platformResult['Value']

          # The CE is created with the CE parameters overridden by the queue ones
          ceQueueDict = dict( ceDict )
          ceQueueDict.update( params )
          ceResult = ceFactory.getCE( ceName = ce,
                                      ceType = ceDict['CEType'],
                                      ceParametersDict = ceQueueDict )
          if not ceResult['OK']:
            return ceResult
          ceObject = ceResult['Value']

          entry['CE'] = ceObject
          entry['CEName'] = ce
          entry['CEType'] = ceDict['CEType']
          entry['Site'] = siteFullName
          entry['QueueName'] = queue
          entry['Platform'] = platform

          validation = ceObject.isValid()
          if not validation['OK']:
            self.log.fatal( validation['Message'] )
            return validation

          # The presence of the option is enough, its value is not looked at
          if 'BundleProxy' in params or 'BundleProxy' in ceDict:
            entry['BundleProxy'] = True

          if siteFullName not in self.sites:
            self.sites.append( siteFullName )

    return S_OK()

  def execute( self ):
    """ One agent cycle: submit pilots and, if requested, update the status of
        the pilots. Failures of either step are logged and not propagated

        :return: S_OK()
    """

    if self.queueDict:
      submission = self.submitJobs()
      if not submission['OK']:
        self.log.error( 'Errors in the job submission: ', submission['Message'] )

      if self.updateStatus:
        statusUpdate = self.updatePilotStatus()
        if not statusUpdate['OK']:
          self.log.error( 'Errors in updating pilot status: ', statusUpdate['Message'] )
    else:
      # Nothing to serve in this cycle
      self.log.warn( 'No site defined, exiting the cycle' )

    return S_OK()

  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        First the Matcher is asked whether there is any task queue matching
        the overall requirements of this director. If so, for each queue
        ( in random order ) the number of pilots to submit is evaluated from
        the free slots of the CE, the jobs in the matching task queues and the
        already waiting pilots. The pilots are then submitted in chunks and
        registered in the PilotAgentsDB, spread over the task queues
        proportionally to the task queue priorities.

        :return: S_OK() || the failed result of the platform lookup, of the
                 Matcher query, of the proxy retrieval or of the executable
                 preparation
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    # Serve the queues in random order. An explicit list is built because
    # random.shuffle() works in place and needs a mutable sequence
    queues = list( self.queueDict )
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy, valid for the queue length plus one day
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = list( taskQueueDict )
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      # Neither more pilots than free slots nor more than jobs not yet covered
      # by a waiting pilot
      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                              ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        # The chunk can differ from pilotsToSubmit, see __getPilotOptions()
        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        # The wrapper script is not needed any more, whatever the outcome
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = result.get( 'PilotStampDict', {} )

        # Cumulative priorities of the matching task queues
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )

        pilotsPerTQ = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          # Fall back to the last task queue when the draw is not below any of
          # the cumulative priorities ( e.g. all the priorities are zero ), so
          # that the pilot is never left without a task queue
          tqID = tqPriorityList[-1][0]
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          pilotsPerTQ.setdefault( tqID, [] ).append( pilotID )

        for tqID, tqPilots in pilotsPerTQ.items():
          result = pilotAgentsDB.addPilotTQReference( tqPilots,
                                                     tqID,
                                                     self.pilotDN,
                                                     self.pilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in tqPilots:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()

#####################################################################################
  def __getExecutable( self, queue, pilotsToSubmit, bundleProxy = True, httpProxy = '', jobExecDir = '' ):
    """ Prepare the full executable for queue

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: number of pilots the caller would like to submit
        :param bundleProxy: if True, self.proxy is embedded in the executable
        :param httpProxy: value passed on to the pilot wrapper script
        :param jobExecDir: value passed on to the pilot wrapper script
        :return: S_OK( [ executable file name, number of pilots to submit ] )
                 || S_ERROR() if the pilot options or the wrapper script can
                 not be prepared
    """

    proxy = None
    if bundleProxy:
      proxy = self.proxy
    # The number of pilots can be changed while compiling the options
    pilotOptions, pilotsToSubmit = self.__getPilotOptions( queue, pilotsToSubmit )
    if pilotOptions is None:
      return S_ERROR( 'Errors in compiling pilot options' )
    executable = self.__writePilotScript( self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir )
    # __writePilotScript returns an S_ERROR structure instead of a file name
    # when the bundling fails ( assumed to be a dictionary with an 'OK' key,
    # as the results checked elsewhere in this class ): propagate it rather
    # than wrapping it in S_OK
    if isinstance( executable, dict ) and not executable.get( 'OK', True ):
      return executable
    return S_OK( [ executable, pilotsToSubmit ] )

#####################################################################################
  def __getPilotOptions( self, queue, pilotsToSubmit ):
    """ Prepare pilot options

        Compiles the command line options of the pilot script out of the
        configuration and of the parameters of <queue>, and requests a proxy
        token for the pilots.

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: number of pilots the caller would like to submit
        :return: [ list of pilot options, number of pilots to submit ], where
                 the number of pilots is recomputed from the number of uses
                 granted with the token, || [ None, None ] in case of errors
    """

    queueDict = self.queueDict[queue]['ParametersDict']
    pilotOptions = []

    setup = gConfig.getValue( "/DIRAC/Setup", "unknown" )
    if setup == 'unknown':
      self.log.error( 'Setup is not defined in the configuration' )
      return [ None, None ]
    pilotOptions.append( '-S %s' % setup )
    opsHelper = Operations.Operations( group = self.pilotGroup, setup = setup )

    #Installation defined?
    installationName = opsHelper.getValue( "Pilot/Installation", "" )
    if installationName:
      pilotOptions.append( '-V %s' % installationName )

    #Project defined?
    projectName = opsHelper.getValue( "Pilot/Project", "" )
    if projectName:
      pilotOptions.append( '-l %s' % projectName )
    else:
      self.log.info( 'DIRAC project will be installed by pilots' )

    #Request a release
    diracVersion = opsHelper.getValue( "Pilot/Version", [] )
    if not diracVersion:
      self.log.error( 'Pilot/Version is not defined in the configuration' )
      return [ None, None ]
    #diracVersion is a list of accepted releases. Just take the first one
    pilotOptions.append( '-r %s' % diracVersion[0] )

    ownerDN = self.pilotDN
    ownerGroup = self.pilotGroup
    # Request token for maximum pilot efficiency
    result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode )
    if not result[ 'OK' ]:
      self.log.error( 'Invalid proxy token request', result['Message'] )
      return [ None, None ]
    ( token, numberOfUses ) = result[ 'Value' ]

    # Number of jobs each pilot is allowed to execute. Without at least one
    # job per pilot no pilot can be submitted, and the division below would
    # fail with a ZeroDivisionError
    jobsPerPilot = min( numberOfUses, self.maxJobsInFillMode )
    if jobsPerPilot <= 0:
      self.log.error( 'No job can be executed with the proxy token',
                      'token uses: %s, MaxJobsInFillMode: %s' % ( numberOfUses, self.maxJobsInFillMode ) )
      return [ None, None ]

    pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
    # Use Filling mode
    pilotOptions.append( '-M %s' % jobsPerPilot )

    # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
    # with numberOfUses tokens we can submit at most: 
    #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    # pilots ( integer division, written explicitly )
    newPilotsToSubmit = numberOfUses // jobsPerPilot
    if newPilotsToSubmit != pilotsToSubmit:
      self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit )
      pilotsToSubmit = newPilotsToSubmit
    # Debug
    if self.pilotLogLevel.lower() == 'debug':
      pilotOptions.append( '-d' )
    # CS Servers
    csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )

    # DIRAC Extensions to be used in pilots: the Pilot/Extensions option if it
    # is defined ( a first value 'None' means no extensions ), otherwise the
    # extensions known to the CS
    pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    # Requested CPU time
    pilotOptions.append( '-T %s' % queueDict['CPUTime'] )
    # CEName
    pilotOptions.append( '-N %s' % self.queueDict[queue]['CEName'] )
    # SiteName
    pilotOptions.append( '-n %s' % queueDict['Site'] )
    if 'ClientPlatform' in queueDict:
      pilotOptions.append( "-p '%s'" % queueDict['ClientPlatform'] )

    if 'SharedArea' in queueDict:
      pilotOptions.append( "-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea'] )

    # The CPU factors are derived from SI00 when it is available, otherwise
    # the explicit queue settings are passed on
    if 'SI00' in queueDict:
      factor = float( queueDict['SI00'] ) / 250.
      pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor )
      pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor )
    else:
      if 'CPUScalingFactor' in queueDict:
        pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] )
      if 'CPUNormalizationFactor' in queueDict:
        pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] )

    # Hack
    if self.defaultSubmitPools:
      pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools )

    if self.group:
      pilotOptions.append( '-G %s' % self.group )

    self.log.verbose( "pilotOptions: ", ' '.join( pilotOptions ) )

    return [ pilotOptions, pilotsToSubmit ]

#####################################################################################
  def __writePilotScript( self, workingDirectory, pilotOptions, proxy = None, httpProxy = '', pilotExecDir = '' ):
    """ Bundle together and write out the pilot executable script, admixt the proxy if given
    """

    try:
      compressedAndEncodedProxy = ''
      proxyFlag = 'False'
      if proxy is not None:
        compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) )
        proxyFlag = 'True'
      compressedAndEncodedPilot = base64.encodestring( bz2.compress( open( self.pilot, "rb" ).read(), 9 ) )
      compressedAndEncodedInstall = base64.encodestring( bz2.compress( open( self.install, "rb" ).read(), 9 ) )
    except:
      self.log.exception( 'Exception during file compression of proxy, dirac-pilot or dirac-install' )
      return S_ERROR( 'Exception during file compression of proxy, dirac-pilot or dirac-install' )

    localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory )
  os.chdir( pilotWorkingDirectory )
  if %(proxyFlag)s:
    open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) )
    os.chmod("proxy",0600)
    os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) )
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % { 'compressedAndEncodedProxy': compressedAndEncodedProxy,
        'compressedAndEncodedPilot': compressedAndEncodedPilot,
        'compressedAndEncodedInstall': compressedAndEncodedInstall,
        'httpProxy': httpProxy,
        'pilotExecDir': pilotExecDir,
        'pilotScript': os.path.basename( self.pilot ),
        'installScript': os.path.basename( self.install ),
        'pilotOptions': ' '.join( pilotOptions ),
        'proxyFlag': proxyFlag }

    fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir = workingDirectory )
    pilotWrapper = os.fdopen( fd, 'w' )
    pilotWrapper.write( localPilot )
    pilotWrapper.close()
    return name

  def updatePilotStatus( self ):
    """ Update status of pilots in transient states

        The first pass over the queues asks each CE for the status of the
        pilots of this director that are in a transient state, updates the
        PilotAgentsDB and retrieves the output of the pilots that reached a
        final state. The second pass retrieves the output of pilots in a final
        state whose output is not yet flagged as ready, and sends the
        accounting records if requested.

        :return: S_OK() || the failed result of the pilot proxy retrieval
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'Status':TRANSIENT_PILOT_STATUS,
                                           'OwnerDN': self.pilotDN,
                                           'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      # References with ':::<stamp>' appended; as soon as one pilot without a
      # stamp is found the plain references are used for all the pilots
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # Renew the proxy of the CE if it is not valid any more
      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 500 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        if oldStatus == ceStatus:
          # Status did not change, continue
          continue
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Pilot finished without reporting, consider it Aborted
          newStatus = 'Aborted'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      # isProxyValid() returns a result structure, as in the first loop: its
      # 'OK' flag has to be checked, the structure itself is always true
      result = ce.isProxyValid( 120 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        # Use the proxy just obtained, not the one of a previous request
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()

  def sendPilotAccounting( self, pilotDict ):
    """ Register one accounting record per pilot of <pilotDict> and commit them

        :param pilotDict: pilot reference -> pilot information dictionary
        :return: S_OK() || the failed result of the commit
    """
    for pRef in pilotDict:
      pilotInfo = pilotDict[pRef]
      self.log.verbose( 'Preparing accounting record for pilot %s' % pRef )

      pA = PilotAccounting()
      pA.setEndTime( pilotInfo[ 'LastUpdateTime' ] )
      pA.setStartTime( pilotInfo[ 'SubmissionTime' ] )

      # The owner name is resolved from the DN, with a placeholder on failure
      retVal = CS.getUsernameForDN( pilotInfo[ 'OwnerDN' ] )
      if retVal[ 'OK' ]:
        userName = retVal[ 'Value' ]
      else:
        userName = '******'
        self.log.error( "Can't determine username for dn:", pilotInfo[ 'OwnerDN' ] )
      pA.setValueByKey( 'User', userName )
      pA.setValueByKey( 'UserGroup', pilotInfo[ 'OwnerGroup' ] )

      # The site is resolved from the CE, 'Unknown' if that gives nothing
      siteName = 'Unknown'
      siteResult = getSiteForCE( pilotInfo[ 'DestinationSite' ] )
      if siteResult['OK'] and siteResult[ 'Value' ].strip():
        siteName = siteResult['Value'].strip()
      pA.setValueByKey( 'Site', siteName )

      pA.setValueByKey( 'GridCE', pilotInfo[ 'DestinationSite' ] )
      pA.setValueByKey( 'GridMiddleware', pilotInfo[ 'GridType' ] )
      pA.setValueByKey( 'GridResourceBroker', pilotInfo[ 'Broker' ] )
      pA.setValueByKey( 'GridStatus', pilotInfo[ 'Status' ] )
      # Number of jobs of the pilot, 0 if no job information is available
      pA.setValueByKey( 'Jobs', len( pilotInfo.get( 'Jobs', [] ) ) )

      self.log.info( "Adding accounting record for pilot %s" % pilotInfo[ 'PilotID' ] )
      retVal = gDataStoreClient.addRegister( pA )
      if retVal[ 'OK' ]:
        # Set up AccountingSent flag
        # NOTE(review): the flag is set here, before the commit, and again
        # for all the pilots after a successful commit - confirm which of
        # the two is intended
        flagResult = pilotAgentsDB.setAccountingFlag( pRef )
        if not flagResult['OK']:
          self.log.error( 'Failed to set accounting flag for pilot ', pRef )
      else:
        self.log.error( 'Failed to send accounting info for pilot ', pRef )

    self.log.info( 'Committing accounting records for %d pilots' % len( pilotDict ) )
    commitResult = gDataStoreClient.commit()
    if not commitResult['OK']:
      return commitResult

    for pRef in pilotDict:
      self.log.verbose( 'Setting AccountingSent flag for pilot %s' % pRef )
      flagResult = pilotAgentsDB.setAccountingFlag( pRef )
      if not flagResult['OK']:
        self.log.error( 'Failed to set accounting flag for pilot ', pRef )

    return S_OK()
示例#6
0
class ResourceStatus(ElementStatus):
    """
  ResourceStatus helper that connects to the CS if the RSS flag is not Active.
  It keeps the connection to the db / server as an object member, to avoid
  creating new connections over and over.
  """

    __metaclass__ = DIRACSingleton

    def __init__(self):
        """
        Set up the helper: run the parent constructor, create the SiteStatus
        helper and build one RSSCache per element type.

        examples
          >>> resourceStatus = ResourceStatus()
        """

        super(ResourceStatus, self).__init__()

        # Used by __getSiteAccess to check the site that hosts a resource.
        self.siteStatus = SiteStatus()

        # The cache lifetime is read from the RSS configuration so that it can
        # be tuned without touching the code.
        rssConfiguration = RssConfiguration()
        lifetime = int(rssConfiguration.getConfigCache())

        # One RSSCache per element type, each given its own refresh callback.
        # Should these be generated on the fly instead of being hardcoded ?
        self.seCache = RSSCache('StorageElement', lifetime, self._updateSECache)
        self.ceCache = RSSCache('ComputingElement', lifetime, self._updateCECache)

    #.............................................................................
    # ComputingElement methods

    def getComputingElementStatuses(self, ceNames, statusTypes=None):
        """
        Query the ComputingElement cache and overlay the site status on top.

        The cache is matched against <ceNames> and <statusTypes>; a failed
        match is returned untouched. For every matched computing element the
        'ComputingAccess' of its site is checked through __getSiteAccess; if
        that check does not return OK ( site not usable, or site lookup
        failed ), all statuses of the element are reported as 'Banned'.

        The value returned on success has the form:
        {
         computingElementA : { statusType1 : status1, statusType2 : status2 },
         computingElementB : { statusType1 : status1, statusType2 : status2 },
        }

        examples ( taken from the previous documentation )
          >>> resourceStatus.getComputingElementStatuses( 'ce207.cern.ch', None )
              S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } }  )
          >>> resourceStatus.getComputingElementStatuses( 'RubbishCE', None )
              S_ERROR( ... )
          >>> resourceStatus.getComputingElementStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
              S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                      'ce207.cern.ch' : { 'all' : 'Active' } }  )

        :Parameters:
          **ceNames** - [ None, `string`, `list` ]
            name(s) of the computing elements to be matched, None meaning all
          **statusTypes** - [ None, `string`, `list` ]
            name(s) of the statusTypes to be matched, None meaning all

        :return: S_OK( dict ) || S_ERROR()
        """

        matched = self.ceCache.match(ceNames, statusTypes)
        if not matched['OK']:
            return matched

        statuses = matched['Value']

        # Walk a snapshot of the names, entries are replaced along the way.
        for name in list(statuses):

            access = self.__getSiteAccess('ComputingElement', name,
                                          'ComputingAccess')
            if access['OK']:
                continue

            # Same statusTypes as before, every one of them forced to Banned.
            statuses[name] = dict.fromkeys(statuses[name].keys(), 'Banned')

        return S_OK(statuses)

    def getComputingElementStatus(self, ceName, statusType):
        """
        Status of a single computing element for a single statusType.

        Thin wrapper: the call is forwarded to getElementStatus with the
        element type fixed to 'ComputingElement'.

        examples ( taken from the previous documentation )
          >>> resourceStatus.getComputingElementStatus( 'ce207.cern.ch', 'all' )
              S_OK( 'Active' )
          >>> resourceStatus.getComputingElementStatus( 'ce207.cern.ch', None )
              S_ERROR( ... )

        :Parameters:
          **ceName** - `string`
            name of the computing element to be matched
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever getElementStatus returns ( documented as
                 S_OK() || S_ERROR() )
        """

        elementType = 'ComputingElement'
        return self.getElementStatus(elementType, ceName, statusType)

    def isUsableComputingElement(self, ceName, statusType):
        """
        Tell whether a computing element is usable for a given statusType.

        Thin wrapper: the call is forwarded to isUsableElement with the
        element type fixed to 'ComputingElement'. According to the previous
        documentation the answer is True when the status is Active or
        Degraded and False for anything else ( unknown element, unknown or
        missing statusType included ) - NOTE(review): isUsableElement is not
        defined here, confirm the return type against it.

        examples ( taken from the previous documentation )
          >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', 'all' )
              True
          >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', None )
              False
          >>> resourceStatus.isUsableComputingElement( 'RubbishCE', 'all' )
              False
          >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', 'RubbishAccess' )
              False

        :Parameters:
          **ceName** - `string`
            name of the computing element to be matched
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever isUsableElement returns
        """

        elementType = 'ComputingElement'
        return self.isUsableElement(elementType, ceName, statusType)

    def getUsableComputingElements(self, statusType):
        """
        Usable computing elements for a given statusType.

        Thin wrapper: the call is forwarded to getUsableElements with the
        element type fixed to 'ComputingElement'. According to the previous
        documentation, usable means that the status for <statusType> is either
        Active or Degraded, and the names come back as a list.

        examples ( taken from the previous documentation )
          >>> resourceStatus.getUsableComputingElements( 'all' )
              S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
          >>> resourceStatus.getUsableComputingElements( None )
              S_ERROR( ... )
          >>> resourceStatus.getUsableComputingElements( 'RubbishAccess' )
              S_ERROR( ... )

        :Parameters:
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever getUsableElements returns ( documented as
                 S_OK() || S_ERROR() )
        """

        elementType = 'ComputingElement'
        return self.getUsableElements(elementType, statusType)

    #.............................................................................
    # StorageElement methods

    def getStorageElementStatuses(self, seNames, statusTypes=None):
        """
        Query the StorageElement cache and overlay the site status on top.

        The cache is matched against <seNames> and <statusTypes>; a failed
        match is returned untouched. For every matched storage element the
        'StorageAccess' of its site is checked through __getSiteAccess; if
        that check does not return OK ( site not usable, or site lookup
        failed ), all statuses of the element are reported as 'Banned'.

        The value returned on success has the form:
        {
         storageElementA : { statusType1 : status1, statusType2 : status2 },
         storageElementB : { statusType1 : status1, statusType2 : status2 },
        }

        examples ( taken from the previous documentation )
          >>> resourceStatus.getStorageElementStatuses( 'CERN-USER', None )
              S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } }  )
          >>> resourceStatus.getStorageElementStatuses( 'RubbishSE', None )
              S_ERROR( ... )
          >>> resourceStatus.getStorageElementStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
              S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                      'PIC-USER'  : { 'ReadAccess' : 'Active' } }  )

        :Parameters:
          **seNames** - [ None, `string`, `list` ]
            name(s) of the storage elements to be matched, None meaning all
          **statusTypes** - [ None, `string`, `list` ]
            name(s) of the statusTypes to be matched, None meaning all

        :return: S_OK( dict ) || S_ERROR()
        """

        matched = self.seCache.match(seNames, statusTypes)
        if not matched['OK']:
            return matched

        statuses = matched['Value']

        # Walk a snapshot of the names, entries are replaced along the way.
        for name in list(statuses):

            access = self.__getSiteAccess('StorageElement', name,
                                          'StorageAccess')
            if access['OK']:
                continue

            # Same statusTypes as before, every one of them forced to Banned.
            statuses[name] = dict.fromkeys(statuses[name].keys(), 'Banned')

        return S_OK(statuses)

    def getStorageElementStatus(self, seName, statusType):
        """
        Status of a single storage element for a single statusType.

        Thin wrapper: the call is forwarded to getElementStatus with the
        element type fixed to 'StorageElement'.

        examples ( adapted from the previous documentation )
          >>> resourceStatus.getStorageElementStatus( 'CERN-USER', 'ReadAccess' )
              S_OK( 'Active' )
          >>> resourceStatus.getStorageElementStatus( 'CERN-USER', None )
              S_ERROR( ... )

        :Parameters:
          **seName** - `string`
            name of the storage element to be matched
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever getElementStatus returns ( documented as
                 S_OK() || S_ERROR() )
        """

        elementType = 'StorageElement'
        return self.getElementStatus(elementType, seName, statusType)

    def isUsableStorageElement(self, seName, statusType):
        """
        Tell whether a storage element is usable for a given statusType.

        Thin wrapper: the call is forwarded to isUsableElement with the
        element type fixed to 'StorageElement'. According to the previous
        documentation the answer is True when the status is Active or
        Degraded and False for anything else ( unknown element, unknown or
        missing statusType included ) - NOTE(review): isUsableElement is not
        defined here, confirm the return type against it.

        examples ( taken from the previous documentation )
          >>> resourceStatus.isUsableStorageElement( 'CERN-USER', 'ReadAccess' )
              True
          >>> resourceStatus.isUsableStorageElement( 'CERN-USER', None )
              False
          >>> resourceStatus.isUsableStorageElement( 'RubbishSE', 'ReadAccess' )
              False
          >>> resourceStatus.isUsableStorageElement( 'CERN-USER', 'RubbishAccess' )
              False

        :Parameters:
          **seName** - `string`
            name of the storage element to be matched
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever isUsableElement returns
        """

        elementType = 'StorageElement'
        return self.isUsableElement(elementType, seName, statusType)

    def getUsableStorageElements(self, statusType):
        """
        Usable storage elements for a given statusType.

        Thin wrapper: the call is forwarded to getUsableElements with the
        element type fixed to 'StorageElement'. According to the previous
        documentation, usable means that the status for <statusType> is either
        Active or Degraded, and the names come back as a list.

        examples ( taken from the previous documentation )
          >>> resourceStatus.getUsableStorageElements( 'ReadAccess' )
              S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
          >>> resourceStatus.getUsableStorageElements( None )
              S_ERROR( ... )
          >>> resourceStatus.getUsableStorageElements( 'RubbishAccess' )
              S_ERROR( ... )

        :Parameters:
          **statusType** - `string`
            name of the statusType to be matched

        :return: whatever getUsableElements returns ( documented as
                 S_OK() || S_ERROR() )
        """

        elementType = 'StorageElement'
        return self.getUsableElements(elementType, statusType)

    #.............................................................................
    # Private methods

    def __getSiteAccess(self, resourceType, elementName, siteAccess):
        """
        Given a resourceType and an elementName, find the name of the site
        that owns the element and check the access <siteAccess> of that site.

        :Parameters:
          **resourceType** - `string`
            name of the resource type ( StorageElement, ComputingElement.. )
          **elementName** - `string`
            name of the resource of type <resourceType>
          **siteAccess** - `string`
            site access ( StorageAccess, ComputingAccess .. )

        :return: S_OK() if the site is usable for <siteAccess>; the failed
                 result of Resources.getSiteForResource if the site cannot be
                 resolved; S_ERROR() naming the site and the access otherwise
        """

        siteName = Resources.getSiteForResource(resourceType, elementName)
        if not siteName['OK']:
            return siteName
        siteName = siteName['Value']

        if not self.siteStatus.isUsableSite(siteName, siteAccess):
            # Report the access that was actually checked: this method is
            # called for StorageAccess as well as for ComputingAccess.
            return S_ERROR('Site %s is not usable for %s' % (siteName,
                                                             siteAccess))

        return S_OK()

    #.............................................................................
    #.............................................................................
    #.............................................................................
    #.............................................................................
    # Old code, to be deleted / refactored soon.

#  def getStorageElementStatus( self, elementName, statusType = None ):
#    """
#    Helper with dual access, tries to get information from the RSS for the given
#    StorageElement, otherwise, it gets it from the CS.
#
#    example:
#      >>> getStorageElementStatus( 'CERN-USER', 'Read' )
#          S_OK( { 'CERN-USER' : { 'Read': 'Active' } } )
#      >>> getStorageElementStatus( 'CERN-USER', 'Write' )
#          S_OK( { 'CERN-USER' : {'Read': 'Active', 'Write': 'Active', 'Check': 'Banned', 'Remove': 'Banned'}} )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_ERROR( xyz.. )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_OK( 'Unknown' )
#
#    """
#
#    if self.__getMode():
#      # We do not apply defaults. If is not on the cache, S_ERROR is returned.
#      return self.__getRSSStorageElementStatus( elementName, statusType )
#    else:
#      return self.__getCSStorageElementStatus( elementName, statusType )

# FIXME: to be deleted !!! ONLY RSS ( scripts, agents and web portal ) should set statuses
#  def setStorageElementStatus( self, elementName, statusType, status, reason = None,
#                               tokenOwner = None ):
#
#    """
#    Helper with dual access, tries set information in RSS and in CS.
#
#    example:
#      >>> getStorageElementStatus( 'CERN-USER', 'Read' )
#          S_OK( { 'Read': 'Active' } )
#      >>> getStorageElementStatus( 'CERN-USER', 'Write' )
#          S_OK( {'Read': 'Active', 'Write': 'Active', 'Check': 'Banned', 'Remove': 'Banned'} )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_ERROR( xyz.. )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType', 'Unknown' )
#          S_OK( 'Unknown' )
#    """
#
#    #if self.__getMode():
#    #return self.__setRSSStorageElementStatus( elementName, statusType, status, reason, tokenOwner )
#    #else:
#    #  return self.__setCSStorageElementStatus( elementName, statusType, status )

#.............................................................................
# update Cache methods

    def _updateCECache(self):
        """
        Refresh callback handed to the ComputingElement RSSCache: fetches the
        current statuses of all elements of type 'ComputingElement'.

        :return: result of __updateCache( 'ComputingElement' )
        """
        elementType = 'ComputingElement'
        return self.__updateCache(elementType)

    def _updateSECache(self):
        """
        Refresh callback handed to the StorageElement RSSCache: fetches the
        current statuses of all elements of type 'StorageElement'.

        :return: result of __updateCache( 'StorageElement' )
        """
        elementType = 'StorageElement'
        return self.__updateCache(elementType)

    def __updateCache(self, elementType):
        """
        Fetch the Name, StatusType and Status columns of every Resource of
        type <elementType> through self.rssClient and convert the rows with
        getCacheDictFromRawData.

        :Parameters:
          **elementType** - `string`
            element type to select ( StorageElement, ComputingElement.. )

        :return: S_OK( cache dictionary ) || the failed result of the query
        """

        columns = ['Name', 'StatusType', 'Status']
        selection = self.rssClient.selectStatusElement(
            'Resource',
            'Status',
            elementType=elementType,
            meta={'columns': columns})

        if selection['OK']:
            return S_OK(self.getCacheDictFromRawData(selection['Value']))
        return selection

    #.............................................................................
    #.............................................................................
    #.............................................................................
    #.............................................................................
    # TODO : delete all this


#  def __getRSSStorageElementStatus( self, elementName, statusType ):
#    """
#    Gets from the cache or the RSS the StorageElements status. The cache is a
#    copy of the DB table. If it is not on the cache, most likely is not going
#    to be on the DB.
#
#    There is one exception: an item just added to the CS, e.g. a new
#    StorageElement. Between the moment it is added to the DB and the moment
#    the change is propagated to the cache, the two will be inconsistent, but
#    this is not dangerous. Just wait <cacheLifeTime> minutes.
#    """
#
#    siteAccess = self.__getSiteAccess( 'StorageElement', elementName, 'StorageAccess' )
#    if not siteAccess[ 'OK' ]:
#      self.log.error( siteAccess[ 'Message' ] )
#      return siteAccess
#
#    cacheMatch = self.seCache.match( elementName, statusType )
#
#    self.log.debug( '__getRSSStorageElementStatus' )
#    self.log.debug( cacheMatch )
#
#    return cacheMatch

#  def __getCSStorageElementStatus( self, elementName, statusType, default = None ):
#    """
#    Gets from the CS the StorageElements status
#    """
#
#    cs_path     = "/Resources/StorageElements"
#
#    if not isinstance( elementName, list ):
#      elementName = [ elementName ]
#
#    statuses = self.rssConfig.getConfigStatusType( 'StorageElement' )
#
#    result = {}
#    for element in elementName:
#
#      if statusType is not None:
#        # Added Active by default
#        res = gConfig.getOption( "%s/%s/%s" % ( cs_path, element, statusType ), 'Active' )
#        if res[ 'OK' ] and res[ 'Value' ]:
#          result[ element ] = { statusType : res[ 'Value' ] }
#
#      else:
#        res = gConfig.getOptionsDict( "%s/%s" % ( cs_path, element ) )
#        if res[ 'OK' ] and res[ 'Value' ]:
#          elementStatuses = {}
#          for elementStatusType, value in res[ 'Value' ].items():
#            if elementStatusType in statuses:
#              elementStatuses[ elementStatusType ] = value
#
#          # If there is no status defined in the CS, we add by default Read and
#          # Write as Active.
#          if elementStatuses == {}:
#            elementStatuses = { 'ReadAccess' : 'Active', 'WriteAccess' : 'Active' }
#
#          result[ element ] = elementStatuses
#
#    if result:
#      return S_OK( result )
#
#    if default is not None:
#
#      # sec check
#      if statusType is None:
#        statusType = 'none'
#
#      defList = [ [ el, statusType, default ] for el in elementName ]
#      return S_OK( getDictFromList( defList ) )
#
#    _msg = "StorageElement '%s', with statusType '%s' is unknown for CS."
#    return S_ERROR( _msg % ( elementName, statusType ) )

#  def __setRSSStorageElementStatus( self, elementName, statusType, status, reason, tokenOwner ):
#    """
#    Sets on the RSS the StorageElements status
#    """
#
#    expiration = datetime.datetime.utcnow() + datetime.timedelta( days = 1 )
#
#    self.seCache.acquireLock()
#    try:
#      res = self.rssClient.modifyStatusElement( 'Resource', 'Status', name = elementName,
#                                                statusType = statusType, status = status,
#                                                reason = reason, tokenOwner = tokenOwner,
#                                                tokenExpiration = expiration )
#      if res[ 'OK' ]:
#        self.seCache.refreshCache()
#
#      if not res[ 'OK' ]:
#        _msg = 'Error updating StorageElement (%s,%s,%s)' % ( elementName, statusType, status )
#        gLogger.warn( 'RSS: %s' % _msg )
#
#      return res
#
#    finally:
#      # Release lock, no matter what.
#      self.seCache.releaseLock()

#  def __setCSStorageElementStatus( self, elementName, statusType, status ):
#    """
#    Sets on the CS the StorageElements status
#    """
#
#    statuses = self.rssConfig.getConfigStatusType( 'StorageElement' )
#    if not statusType in statuses:
#      gLogger.error( "%s is not a valid statusType" % statusType )
#      return S_ERROR( "%s is not a valid statusType: %s" % ( statusType, statuses ) )
#
#    csAPI = CSAPI()
#
#    cs_path     = "/Resources/StorageElements"
#
#    csAPI.setOption( "%s/%s/%s" % ( cs_path, elementName, statusType ), status )
#
#    res = csAPI.commitChanges()
#    if not res[ 'OK' ]:
#      gLogger.warn( 'CS: %s' % res[ 'Message' ] )
#
#    return res

#  def __getMode( self ):
#    """
#      Get's flag defined ( or not ) on the RSSConfiguration. If defined as 1,
#      we use RSS, if not, we use CS.
#    """
#
#    res = self.rssConfig.getConfigState()
#
#    if res == 'Active':
#
#      if self.rssClient is None:
#        self.rssClient = ResourceStatusClient()
#      return True
#
#    self.rssClient = None
#    return False

################################################################################

#def getDictFromList( fromList ):
#  '''
#  Auxiliary method that given a list returns a dictionary of dictionaries:
#  { site1 : { statusType1 : st1, statusType2 : st2 }, ... }
#  '''
#
#  res = {}
#  for listElement in fromList:
#    site, sType, status = listElement
#    if not res.has_key( site ):
#      res[ site ] = {}
#    res[ site ][ sType ] = status
#  return res

#def getCacheDictFromRawData( rawList ):
#  """
#  Formats the raw data list, which we know it must have tuples of three elements.
#  ( element1, element2, element3 ) into a list of tuples with the format
#  ( ( element1, element2 ), element3 ). Then, it is converted to a dictionary,
#  which will be the new Cache.
#
#  It happens that element1 is elementName, element2 is statusType and element3
#  is status.
#
#  :Parameters:
#    **rawList** - `list`
#      list of three element tuples [( element1, element2, element3 ),... ]
#
#  :return: dict of the form { ( elementName, statusType ) : status, ... }
#  """
#
#  res = [ ( ( name, sType ), status ) for name, sType, status in rawList ]
#  return dict( res )

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
示例#7
0
  def getPilotSummaryWeb( self, selectDict, sortList, startItem, maxItems ):
    """ Get summary of the pilot jobs status by CE/site in a standard structure

        Four counter queries are run on the PilotAgents table ( the selection
        itself, Aborted pilots of the last hour, Aborted / Done pilots of the
        last day and Done pilots of the last day with CurrentJobID 0 ) and
        folded into one record per CE, or one 'Multiple' record per site when
        the site has several CEs and no site is being expanded.

        Every record has the columns
          Site, CE, <one counter per state>, Total, PilotsPerJob, PilotJobEff,
          Status, InMask

        :param dict selectDict: selection conditions passed on to getCounters.
               The keys 'LastUpdateTime', 'GridSite', 'Status' and 'ExpandSite'
               are interpreted here and not passed on as they are. The
               dictionary of the caller is left untouched.
        :param sortList: not used by this method
        :param int startItem: index of the first record to return
        :param int maxItems: maximum number of records to return, 0 for all
        :return: S_OK( dict ) with the keys 'TotalRecords', 'ParameterNames',
                 'Records' and 'Extras', or the failed result of getCounters
    """

    stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done', 'Aborted']
    allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
    paramNames = ['Site', 'CE'] + allStateNames

    def asList( value ):
      """ Wrap a single selection value into a list """
      if isinstance( value, list ):
        return value
      return [value]

    def pilotsPerJob( done, empty ):
      """ Pilot submission efficiency: Done pilots per non-empty Done pilot """
      if ( done - empty ) > 0:
        return float( done ) / float( done - empty )
      if done == 0:
        return 0.
      if empty == done:
        return 99.
      return 0.

    def pilotJobEff( total, aborted ):
      """ Percentage of pilots that were not aborted, 100 if there are no pilots """
      if total > 0:
        return float( total - aborted ) / float( total ) * 100.
      return 100.

    def quality( total, eff, minPilots ):
      """ Quality label for an efficiency, 'Idle' with minPilots pilots or less """
      if total <= minPilots:
        return 'Idle'
      if eff < 25.:
        return 'Bad'
      if eff < 60.:
        return 'Poor'
      if eff < 85.:
        return 'Fair'
      return 'Good'

    def countPilots( conditions, since ):
      """ Pilot counters per site, CE and status updated after <since> """
      return self.getCounters( 'PilotAgents',
                               ['GridSite', 'DestinationSite', 'Status'],
                               conditions, newer = since, timeStamp = 'LastUpdateTime' )

    # Work on a copy: the selection is modified below and the dictionary of
    # the caller must not change behind its back.
    selectDict = dict( selectDict )

    last_update = selectDict.pop( 'LastUpdateTime', None )

    site_select = []
    if 'GridSite' in selectDict:
      site_select = asList( selectDict.pop( 'GridSite' ) )

    # 'Status' selects on the quality label of the record, not on the pilot status
    status_select = []
    if 'Status' in selectDict:
      status_select = asList( selectDict.pop( 'Status' ) )

    expand_site = ''
    if 'ExpandSite' in selectDict:
      expand_site = selectDict.pop( 'ExpandSite' )
      site_select = [expand_site]

    # Get all the data from the database with various selections
    result = countPilots( selectDict, last_update )
    if not result['OK']:
      return result

    last_update = Time.dateTime() - Time.hour
    selectDict['Status'] = 'Aborted'
    resultHour = countPilots( selectDict, last_update )
    if not resultHour['OK']:
      return resultHour

    last_update = Time.dateTime() - Time.day
    selectDict['Status'] = ['Aborted', 'Done']
    resultDay = countPilots( selectDict, last_update )
    if not resultDay['OK']:
      return resultDay

    selectDict['CurrentJobID'] = 0
    selectDict['Status'] = 'Done'
    resultDayEmpty = countPilots( selectDict, last_update )
    if not resultDayEmpty['OK']:
      return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap['OK']:
      ceMap = resMap['Value']

    # Sort out different counters
    resultDict = { 'Unknown' : {} }

    def ceCounters( attDict ):
      """ Counters of the site / CE of a row, created zeroed if not there yet.
          The same Unknown-site resolution is applied to all the queries, so
          that they always address the same entry; creating the entry on
          demand avoids a KeyError when a query reports a CE that the first
          one did not.
      """
      site = attDict['GridSite']
      ce = attDict['DestinationSite']
      if site == 'Unknown' and ce not in ( 'Unknown', 'Multiple' ) and ce in ceMap:
        site = ceMap[ce]
      siteDict = resultDict.setdefault( site, {} )
      if ce not in siteDict:
        siteDict[ce] = dict.fromkeys( allStateNames, 0 )
      return siteDict[ce]

    # NOTE(review): two rows resolving to the same site / CE overwrite each
    # other instead of being added up - confirm that this is intended.
    for attDict, count in result['Value']:
      ceCounters( attDict )[attDict['Status']] = count

    # Done and Aborted are reported for the last day only
    for attDict, count in resultDay['Value']:
      state = attDict['Status']
      if state in ( 'Done', 'Aborted' ):
        ceCounters( attDict )[state] = count

    for attDict, count in resultDayEmpty['Value']:
      if attDict['Status'] == 'Done':
        ceCounters( attDict )['Done_Empty'] = count

    for attDict, count in resultHour['Value']:
      if attDict['Status'] == 'Aborted':
        ceCounters( attDict )['Aborted_Hour'] = count

    def addEvaluation( itemList, counters, total ):
      """ Append PilotsPerJob, PilotJobEff and the quality label to a record """
      itemList.append( '%.2f' % pilotsPerJob( counters['Done'], counters['Done_Empty'] ) )
      eff = pilotJobEff( total, counters['Aborted'] )
      itemList.append( '%.2f' % eff )
      itemList.append( quality( total, eff, 10 ) )

    records = []
    siteSumDict = dict.fromkeys( allStateNames + ['Total'], 0 )
    for site in resultDict:
      siteCEs = resultDict[site]
      sumDict = dict.fromkeys( allStateNames + ['Total'], 0 )

      for ce in siteCEs:
        counters = siteCEs[ce]
        itemList = [site, ce] + [ counters[state] for state in allStateNames ]
        for state in allStateNames:
          sumDict[state] += counters[state]

        # Total number of pilots: Done_Empty and Aborted_Hour are subsets of
        # Done and Aborted, they must not be counted twice
        total = sum( counters[state] for state in stateNames )
        sumDict['Total'] += total
        itemList.append( total )
        addEvaluation( itemList, counters, total )

        if len( siteCEs ) == 1 or expand_site:
          records.append( itemList )

      # Sites with several CEs are folded into one record unless expanded
      if len( siteCEs ) > 1 and not expand_site:
        itemList = [site, 'Multiple'] + [ sumDict[state] for state in allStateNames + ['Total'] ]
        addEvaluation( itemList, sumDict, sumDict['Total'] )
        records.append( itemList )

      for state in sumDict:
        siteSumDict[state] += sumDict[state]

    # Perform site selection
    if site_select:
      records = [ r for r in records if r[0] in site_select ]

    # Perform status selection, the quality label is the 15th column
    if status_select:
      records = [ r for r in records if r[14] in status_select ]

    # Get the Site Mask data
    siteStatus = SiteStatus()
    for r in records:
      #
      #FIXME: using only ComputingAccess
      #
      if siteStatus.isUsableSite( r[0], 'ComputingAccess' ):
        r.append( 'Yes' )
      else:
        r.append( 'No' )

    finalDict = {}
    finalDict['TotalRecords'] = len( records )
    finalDict['ParameterNames'] = paramNames + \
                                 ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
      finalDict['Records'] = records[startItem:startItem + maxItems]
    else:
      finalDict['Records'] = records

    # Overall evaluation, computed over all the sites before any selection
    total = siteSumDict['Total']
    siteSumDict['PilotsPerJob'] = '%.2f' % pilotsPerJob( siteSumDict['Done'],
                                                         siteSumDict['Done_Empty'] )
    eff = pilotJobEff( total, siteSumDict['Aborted'] )
    siteSumDict['PilotJobEff'] = '%.2f' % eff
    siteSumDict['Status'] = quality( total, eff, 100 )
    finalDict['Extras'] = siteSumDict

    return S_OK( finalDict )