Exemplo n.º 1
0
  def __checkLoggingInfo( self, jobID, jobDict ):
    """ Determine the start and end time of a job from its logging records.

        :param jobID: id of the job in the JobLoggingDB
        :param jobDict: job attribute dict (reads StartExecTime and SubmissionTime)
        :return: tuple ( startTime, endTime ) of datetime objects
    """
    logList = []
    result = self.logDB.getJobLoggingInfo( jobID )
    if result['OK']:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
      # logging record tuple: status, minor, app, stime, source
      for items in logList:
        if items[0] == 'Running':
          startTime = items[3]
          break
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      stringTime = startTime
      startTime = fromString( stringTime )
      if startTime is None:
        # log the offending string itself: 'items' may be unbound here
        # when the time came from jobDict rather than the logging records
        self.log.error( 'Wrong timestamp in DB', stringTime )
        startTime = dateTime()

    endTime = dateTime()
    stalledTime = None
    # logging record tuple: status, minor, app, stime, source
    for items in logList:
      if items[0] == 'Stalled':
        stalledTime = items[3]
        endTime = fromString( stalledTime )
    if endTime is None:
      self.log.error( 'Wrong timestamp in DB', stalledTime )
      endTime = dateTime()

    return startTime, endTime
Exemplo n.º 2
0
    def _checkLoggingInfo(self, jobID, jobDict):
        """Determine the start and end time of a job from its logging records.

        :param jobID: id of the job in the JobLoggingDB
        :param jobDict: job attribute dict (reads StartExecTime, SubmissionTime)
        :return: tuple (startTime, endTime) of datetime objects
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result["OK"]:
            logList = result["Value"]

        startTime = jobDict["StartExecTime"]
        if not startTime or startTime == "None":
            # logging record tuple: status, minor, app, stime, source
            for items in logList:
                if items[0] == "Running":
                    startTime = items[3]
                    break
            if not startTime or startTime == "None":
                startTime = jobDict["SubmissionTime"]

        if isinstance(startTime, six.string_types):
            stringTime = startTime
            startTime = fromString(stringTime)
            if startTime is None:
                # log the offending string itself: 'items' may be unbound here
                # when the time came from jobDict rather than the logging records
                self.log.error("Wrong timestamp in DB", stringTime)
                startTime = dateTime()

        endTime = dateTime()
        stalledTime = None
        # logging record tuple: status, minor, app, stime, source
        for items in logList:
            if items[0] == "Stalled":
                stalledTime = items[3]
                endTime = fromString(stalledTime)
        if endTime is None:
            self.log.error("Wrong timestamp in DB", stalledTime)
            endTime = dateTime()

        return startTime, endTime
Exemplo n.º 3
0
    def __checkLoggingInfo(self, jobID, jobDict):
        """ Determine the start and end time of a job from its logging records.

        :param jobID: id of the job in the JobLoggingDB
        :param jobDict: job attribute dict (reads StartExecTime, SubmissionTime)
        :return: tuple (startTime, endTime) of datetime objects
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result['OK']:
            logList = result['Value']

        startTime = jobDict['StartExecTime']
        if not startTime or startTime == 'None':
            # logging record tuple: status, minor, app, stime, source
            for items in logList:
                if items[0] == 'Running':
                    startTime = items[3]
                    break
            if not startTime or startTime == 'None':
                startTime = jobDict['SubmissionTime']

        if type(startTime) in types.StringTypes:
            stringTime = startTime
            startTime = fromString(stringTime)
            if startTime is None:
                # log the offending string itself: 'items' may be unbound here
                # when the time came from jobDict rather than the logging records
                self.log.error('Wrong timestamp in DB', stringTime)
                startTime = dateTime()

        endTime = dateTime()
        stalledTime = None
        # logging record tuple: status, minor, app, stime, source
        for items in logList:
            if items[0] == 'Stalled':
                stalledTime = items[3]
                endTime = fromString(stalledTime)
        if endTime is None:
            self.log.error('Wrong timestamp in DB', stalledTime)
            endTime = dateTime()

        return startTime, endTime
Exemplo n.º 4
0
  def execute(self):
    """ The main agent execution method

        Count and then delete all MessageRepository records older
        than the configured retention period.
    """
    # Cut-off timestamp; drop the fractional-seconds part
    limitDate = toString(dateTime() - self.period)
    limitDate = limitDate[:limitDate.find('.')]

    selection = "FROM MessageRepository WHERE messageTime < '%s'" % limitDate

    result = self.SystemLoggingDB._query("SELECT count(*) %s" % selection)
    if not result['OK']:
      return result
    recordsToErase = result['Value'][0][0]

    if not recordsToErase:
      self.log.info('No records to erase')
      return S_OK('No records to erase')

    result = self.SystemLoggingDB._update("DELETE LOW_PRIORITY %s" % selection)
    if not result['OK']:
      self.log.error('Could not erase the requested records',
                     'those older than %s' % limitDate)
      return result

    self.log.info('%s records have been erased' % recordsToErase)
    return result
Exemplo n.º 5
0
  def execute(self):
    """ Main execution method

        Delete 'Done' requests older than the grace removal period.
    """
    toDate = dateTime() - day * self.graceRemovalPeriod
    result = self.requestClient.selectRequests({'Status': 'Done', 'ToDate': str(toDate)})
    if not result['OK']:
      return result

    for rID, rName in result['Value'].items():
      gLogger.verbose("Removing request %s" % rName)
      removal = self.requestClient.deleteRequest(rName)
      if removal['OK']:
        gLogger.info('Successfully removed request %d/%s' % (rID, rName))
      else:
        gLogger.error('Failed to delete request %s' % rName, removal['Message'])

    # placeholders for future functionality
    if self.checkAssigned:
      pass

    if self.ftsCleaning:
      pass

    return S_OK()
Exemplo n.º 6
0
    def __failCompletedJobs(self):
        """ Fail Jobs stuck in Completed status for a long time.

        They are due to pilots being killed during the
        finalization of the job execution.

        :return: S_OK / S_ERROR
        """

        # Get old Completed Jobs
        checkTime = str(dateTime() - self.completedTime * second)
        result = self.jobDB.selectJobs({'Status': 'Completed'},
                                       older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        jobIDs = result['Value']
        if not jobIDs:
            return S_OK()

        # Skip those with Minor Status "Pending Requests"
        for jobID in jobIDs:
            result = self.jobDB.getJobAttribute(jobID, 'MinorStatus')
            if not result['OK']:
                self.log.error('Failed to get MinorStatus', result['Message'])
                continue
            if result['Value'] == "Pending Requests":
                continue

            result = self.__updateJobStatus(jobID, 'Failed',
                                            "Job died during finalization")
            if not result['OK']:
                self.log.error('Failed to update job status',
                               result['Message'])
            result = self.__sendAccounting(jobID)
            if not result['OK']:
                self.log.error('Failed to send accounting', result['Message'])
                continue

        return S_OK()
Exemplo n.º 7
0
  def __failCompletedJobs( self ):
    """ Fail Jobs stuck in Completed status for a long time.
        They are due to pilots being killed during the
        finalization of the job execution.

        :return: S_OK / S_ERROR
    """

    # Get old Completed Jobs
    checkTime = str( dateTime() - self.completedTime * second )
    result = self.jobDB.selectJobs( {'Status':'Completed'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if not jobIDs:
      return S_OK()

    # Skip those with Minor Status "Pending Requests"
    for jobID in jobIDs:
      result = self.jobDB.getJobAttribute( jobID, 'MinorStatus' )
      if not result['OK']:
        self.log.error( 'Failed to get MinorStatus', result['Message'] )
        continue
      if result['Value'] == "Pending Requests":
        continue

      result = self.__updateJobStatus( jobID, 'Failed',
                                       "Job died during finalization" )
      if not result['OK']:
        self.log.error( 'Failed to update job status', result['Message'] )
      result = self.__sendAccounting( jobID )
      if not result['OK']:
        self.log.error( 'Failed to send accounting', result['Message'] )
        continue

    return S_OK()
Exemplo n.º 8
0
    def _failSubmittingJobs(self):
        """Fail jobs stuck in Submitting status for too long.

        Such jobs are the leftovers of a failed bulk submission
        transaction.

        :return: S_OK / S_ERROR
        """
        # Select jobs that entered Submitting before the cutoff time
        cutoff = str(dateTime() - self.submittingTime * second)
        result = self.jobDB.selectJobs({'Status': 'Submitting'}, older=cutoff)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        if not result['Value']:
            return S_OK()

        for jobID in result['Value']:
            update = self.__updateJobStatus(jobID, 'Failed')
            if not update['OK']:
                self.log.error('Failed to update job status',
                               update['Message'])

        return S_OK()
Exemplo n.º 9
0
    def FillMessageRepository(self):
        """Fill the MessageRepository table with random values.

        Useful to test the performance of the database.
        """
        self.__CreateAuxiliaryLists()
        logLevels = [
            'ALWAYS', 'INFO', 'VERB', 'DEBUG', 'WARN', 'ERROR', 'EXCEPT',
            'FATAL'
        ]
        initialDate = dateTime()

        for _i in range(1, 800):
            # random timestamp up to ~70 days in the past
            limitDate = toString(initialDate - randrange(0, 1680) * hour -
                                 randrange(0, 60) * minute -
                                 randrange(0, 60) * second)
            message = tupleToMessage([
                self.systemNames[randrange(0, 5)], logLevels[randrange(0, 8)],
                limitDate, self.fixedMessages[randrange(0, 6)],
                'variable text %s' % randrange(0, 6), '',
                self.subSystemNames[randrange(0, 5)],
                self.sites[randrange(0, 5)]
            ])
            userId = randrange(0, 12)
            result = self.insertMessageIntoDB(message, self.users[userId][0],
                                              self.users[userId][1],
                                              self.clientIPs[randrange(0, 20)])
            if not result['OK']:
                # S_ERROR dicts carry the error text under 'Message';
                # reading 'Value' here would raise a KeyError
                print(result['Message'])
Exemplo n.º 10
0
    def execute(self):
        """ The main agent execution method

        Count and delete MessageRepository records older than the
        configured retention period.
        """
        # Cut-off timestamp; drop the fractional-seconds part
        limitDate = toString(dateTime() - self.period)
        limitDate = limitDate[:limitDate.find('.')]

        selection = "FROM MessageRepository WHERE messageTime < '%s'" % limitDate

        result = self.SystemLoggingDB._query("SELECT count(*) %s" % selection)
        if not result['OK']:
            return result
        recordsToErase = result['Value'][0][0]

        if not recordsToErase:
            self.log.info('No records to erase')
            return S_OK('No records to erase')

        result = self.SystemLoggingDB._update("DELETE LOW_PRIORITY %s" % selection)
        if not result['OK']:
            self.log.error('Could not erase the requested records',
                           'those older than %s' % limitDate)
            return result

        self.log.info('%s records have been erased' % recordsToErase)
        return result
Exemplo n.º 11
0
  def FillMessageRepository(self):
    """Fill the MessageRepository table with random values.

       Useful to test the performance of the database.
    """
    self.__CreateAuxiliaryLists()
    logLevels = [ 'ALWAYS' , 'INFO', 'VERB', 'DEBUG', 'WARN',
                  'ERROR', 'EXCEPT', 'FATAL' ]
    initialDate = dateTime()

    for _i in range( 1, 800 ):
      # random timestamp up to ~70 days in the past
      limitDate = toString( initialDate - randrange( 0, 1680 ) * hour -
                            randrange( 0, 60 ) * minute -
                            randrange( 0, 60 ) * second )
      message = tupleToMessage ( [ self.systemNames[ randrange( 0, 5 ) ],
                          logLevels[ randrange( 0, 8 ) ], limitDate,
                          self.fixedMessages[ randrange( 0, 6 ) ],
                          'variable text %s' % randrange( 0, 6 ), '',
                          self.subSystemNames[ randrange( 0, 5 ) ],
                          self.sites[ randrange( 0, 5 ) ] ] )
      userId = randrange( 0, 12 )
      result = self.insertMessageIntoDB( message, self.users[ userId ][ 0 ],
                                         self.users[ userId ][ 1 ],
                                         self.clientIPs[ randrange( 0, 20 ) ] )
      if not result['OK']:
        # S_ERROR carries the error text under 'Message', not 'Value'
        print( result['Message'] )
Exemplo n.º 12
0
    def __failCompletedJobs(self):
        """ Fail Jobs stuck in Completed status for a long time.

      They are due to pilots being killed during the
      finalization of the job execution.

      :return: S_OK / S_ERROR
    """

        # Get old Completed Jobs
        checkTime = str(dateTime() - self.completedTime * second)
        result = self.jobDB.selectJobs({"Status": "Completed"}, older=checkTime)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result

        jobIDs = result["Value"]
        if not jobIDs:
            return S_OK()

        # Skip jobs that changed state meanwhile or have pending requests
        for jobID in jobIDs:
            result = self.jobDB.getJobAttributes(jobID, ["Status", "MinorStatus"])
            if not result["OK"]:
                self.log.error("Failed to get job attributes", result["Message"])
                continue
            if result["Value"]["Status"] != "Completed":
                continue
            if result["Value"]["MinorStatus"] == "Pending Requests":
                continue

            result = self.__updateJobStatus(jobID, "Failed", "Job died during finalization")
            if not result["OK"]:
                self.log.error("Failed to update job status", result["Message"])
            result = self.__sendAccounting(jobID)
            if not result["OK"]:
                self.log.error("Failed to send accounting", result["Message"])
                continue

        return S_OK()
    def export_checkComponentLog(self, component):
        """ Check component log for errors

        :param component: 'System/Component' name, a list of such names,
                          or '*' for all set-up services and agents
        :return: S_OK with {component: {ErrorsHour, ErrorsDay, LastError}}
        """
        componentList = []
        if '*' in component:
            if component == '*':
                result = InstallTools.getSetupComponents()
                if result['OK']:
                    for ctype in ['Services', 'Agents']:
                        if ctype in result['Value']:
                            for sname in result['Value'][ctype]:
                                for cname in result['Value'][ctype][sname]:
                                    componentList.append('/'.join(
                                        [sname, cname]))
        elif isinstance(component, StringTypes):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for c in componentList:
            if '/' not in c:
                continue
            system, cname = c.split('/')

            startDir = InstallTools.startDir
            currentLog = startDir + '/' + system + '_' + cname + '/log/current'
            # 'with open' replaces the py2-only file() builtin and guarantees
            # the log file is closed even if reading raises
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()

            # count errors in the last hour and last day
            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ''
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    timeStamp = fromString(fields[0] + ' ' + fields[1])
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split('ERROR:')[-1].strip()

            resultDict[c] = {
                'ErrorsHour': errors_1,
                'ErrorsDay': errors_24,
                'LastError': lastError
            }

        return S_OK(resultDict)
Exemplo n.º 14
0
    def __processDir(self, dirPath, dirMetadata):
        ''' Calculate the number of files and total size of :dirPath:,
            queue the usage data for publishing, remove the directory if
            it is empty, and schedule its still-active sub-directories
            for further exploration.

            :param dirPath: logical path of the directory being processed
            :param dirMetadata: dict with at least 'SubDirs', 'ClosedDirs',
                'Files', 'TotalSize' and 'SiteUsage' entries
        '''
        subDirs = dirMetadata['SubDirs']
        closedDirs = dirMetadata['ClosedDirs']
        ##############################
        # FIXME: Until we understand why closed dirs are not working...
        ##############################
        closedDirs = []  # NOTE: deliberately discards ClosedDirs info (see FIXME above)
        prStr = "%s: found %s sub-directories" % (dirPath, len(subDirs)
                                                  if subDirs else 'no')
        if closedDirs:
            prStr += ", %s are closed (ignored)" % len(closedDirs)
        # closed and explicitly ignored directories are not explored further
        for rmDir in closedDirs + self.__ignoreDirsList:
            subDirs.pop(rmDir, None)
        numberOfFiles = long(dirMetadata['Files'])
        totalSize = long(dirMetadata['TotalSize'])
        if numberOfFiles:
            prStr += " and %s files (%s bytes)" % (numberOfFiles, totalSize)
        else:
            prStr += " and no files"
        self.log.notice(prStr)
        if closedDirs:
            self.log.verbose("Closed dirs:\n %s" % '\n'.join(closedDirs))
        siteUsage = dirMetadata['SiteUsage']
        if numberOfFiles > 0:
            # non-empty directory: queue its usage data for publishing
            dirData = {
                'Files': numberOfFiles,
                'TotalSize': totalSize,
                'SEUsage': siteUsage
            }
            self.__addDirToPublishQueue(dirPath, dirData)
            # Print statistics
            self.log.verbose(
                "%-40s %20s %20s" %
                ('Storage Element', 'Number of files', 'Total size'))
            for storageElement in sorted(siteUsage):
                usageDict = siteUsage[storageElement]
                self.log.verbose(
                    "%-40s %20s %20s" % (storageElement, str(
                        usageDict['Files']), str(usageDict['Size'])))
        # If it's empty delete it (never the base directory itself);
        # early return: an empty, removed dir has no sub-dirs to schedule
        elif len(subDirs) == 0 and len(closedDirs) == 0:
            if dirPath != self.__baseDir:
                self.removeEmptyDir(dirPath)
                return
        # We don't need the cached information about owner
        self.__directoryOwners.pop(dirPath, None)
        rightNow = dateTime()
        # keep only sub-dirs still within the configured activity window
        chosenDirs = [
            subDir
            for subDir in subDirs if not self.activePeriod or timeInterval(
                subDirs[subDir], self.activePeriod * week).includes(rightNow)
        ]

        self.__dirExplorer.addDirList(chosenDirs)
        self.__processedDirs += 1
Exemplo n.º 15
0
  def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service

    :return: S_OK(token) or S_ERROR
    """

    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    authArgs = {}
    authDict = None
    if user and password:
      # username/password credentials
      authDict = {'auth': {"passwordCredentials": {"username": user,
                                                   "password": password}
                           }
                  }
      if self.project:
        authDict['auth']['tenantName'] = self.project
    elif self.parameters.get('Auth') == "voms":
      # VOMS proxy based authentication
      authDict = {'auth': {'voms': True}}
      if self.project:
        authDict['auth']['tenantName'] = self.project

      if self.parameters.get('Proxy'):
        authArgs['cert'] = self.parameters.get('Proxy')

    if authDict is None:
      # without this guard the requests.post call below would raise a
      # NameError when neither password nor voms credentials are configured
      return S_ERROR('No valid credentials provided for keystone authentication')

    try:
      result = requests.post("%s/tokens" % self.url,
                             headers={"Content-Type": "application/json"},
                             json=authDict,
                             verify=self.caPath,
                             **authArgs)
    except Exception as exc:
      return S_ERROR('Exception getting keystone token: %s' % str(exc))

    output = result.json()

    if result.status_code in [400, 401]:
      message = "None"
      if 'error' in output:
        message = output['error'].get('message')
      return S_ERROR('Authorization error: %s' % message)

    self.token = str(output['access']['token']['id'])
    # derive our expiry from the token's issued/expires interval
    expires = fromString(str(output['access']['token']['expires']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(output['access']['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    self.expires = dateTime() + (expires - issued)

    self.projectID = output['access']['token']['tenant']['id']

    # remember the public endpoints of the services we use
    for endpoint in output['access']['serviceCatalog']:
      if endpoint['type'] == 'compute':
        self.computeURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'image':
        self.imageURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'network':
        self.networkURL = str(endpoint['endpoints'][0]['publicURL'])
    return S_OK(self.token)
Exemplo n.º 16
0
    def __kickStuckJobs(self):
        """ Reschedule jobs stuck in initialization status Rescheduled, Matched

        :return: S_OK / S_ERROR
        """

        message = ''

        checkTime = str(dateTime() - self.matchedTime * second)
        result = self.jobDB.selectJobs({'Status': 'Matched'}, older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Matched status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                message = 'Failed to reschedule %d jobs stuck in Matched status' % len(
                    result['FailedJobs'])

        checkTime = str(dateTime() - self.rescheduledTime * second)
        result = self.jobDB.selectJobs({'Status': 'Rescheduled'},
                                       older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Rescheduled status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                if message:
                    message += '\n'
                message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len(
                    result['FailedJobs'])

        if message:
            return S_ERROR(message)
        else:
            return S_OK()
Exemplo n.º 17
0
    def _kickStuckJobs(self):
        """Reschedule jobs stuck in initialization status Rescheduled, Matched"""

        failures = []

        # (status constant, human-readable name, age threshold in seconds)
        stuckStates = ((JobStatus.MATCHED, "Matched", self.matchedTime),
                       (JobStatus.RESCHEDULED, "Rescheduled", self.rescheduledTime))
        for status, name, delay in stuckStates:
            checkTime = dateTime() - delay * second
            result = self.jobDB.selectJobs({"Status": status},
                                           older=checkTime)
            if not result["OK"]:
                self.log.error("Failed to select jobs", result["Message"])
                return result

            jobIDs = result["Value"]
            if jobIDs:
                self.log.info("Rescheduling %d jobs stuck in %s status" %
                              (len(jobIDs), name))
                result = self.jobDB.rescheduleJobs(jobIDs)
                if "FailedJobs" in result:
                    failures.append("Failed to reschedule %d jobs stuck in %s status" %
                                    (len(result["FailedJobs"]), name))

        if failures:
            return S_ERROR("\n".join(failures))
        return S_OK()
Exemplo n.º 18
0
 def finalize( self ):
   """ Finalize the transfer: register successful files, send the
       accounting record, and clean up failed / missing replicas.
   """
   transEndTime = dateTime()
   registrationStart = time.time()
   res = self.__registerSuccessful()
   successful, total = res['Value']
   registrationTime = time.time() - registrationStart
   if self.sourceSE and self.targetSE:
     self.__sendAccounting( successful, total, registrationTime, transEndTime )
   self.__removeFailedTargets()
   self.__determineMissingSource()
   return S_OK()
Exemplo n.º 19
0
 def finalize(self):
     """Finalize the transfer: register successes, report the accounting
     record, and handle failed or missing replicas."""
     endOfTransfer = dateTime()
     registrationStart = time.time()
     regSuc, regTotal = self.__registerSuccessful()['Value']
     elapsed = time.time() - registrationStart
     if self.sourceSE and self.targetSE:
         self.__sendAccounting(regSuc, regTotal, elapsed, endOfTransfer)
     self.__removeFailedTargets()
     self.__determineMissingSource()
     return S_OK()
Exemplo n.º 20
0
  def __kickStuckJobs( self ):
    """ Reschedule jobs stuck in initialization status Rescheduled, Matched

        :return: S_OK / S_ERROR
    """

    message = ''

    checkTime = str( dateTime() - self.matchedTime * second )
    result = self.jobDB.selectJobs( {'Status':'Matched'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if jobIDs:
      self.log.info( 'Rescheduling %d jobs stuck in Matched status' % len( jobIDs ) )
      result = self.jobDB.rescheduleJobs( jobIDs )
      if 'FailedJobs' in result:
        message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs'] )

    checkTime = str( dateTime() - self.rescheduledTime * second )
    result = self.jobDB.selectJobs( {'Status':'Rescheduled'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if jobIDs:
      self.log.info( 'Rescheduling %d jobs stuck in Rescheduled status' % len( jobIDs ) )
      result = self.jobDB.rescheduleJobs( jobIDs )
      if 'FailedJobs' in result:
        if message:
          message += '\n'
        message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs'] )

    if message:
      return S_ERROR( message )
    else:
      return S_OK()

#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#
Exemplo n.º 21
0
  def export_checkComponentLog( self, component ):
    """ Check component log for errors

        :param component: 'System/Component' name, a list of such names,
                          or '*' for all set-up services and agents
        :return: S_OK with {component: {ErrorsHour, ErrorsDay, LastError}}
    """
    componentList = []
    if '*' in component:
      if component == '*':
        result = InstallTools.getSetupComponents()
        if result['OK']:
          for ctype in ['Services', 'Agents']:
            if ctype in result['Value']:
              for sname in result['Value'][ctype]:
                for cname in result['Value'][ctype][sname]:
                  componentList.append( '/'.join( [sname, cname] ) )
    elif isinstance( component, StringTypes ):
      componentList = [component]
    else:
      componentList = component

    resultDict = {}
    for c in componentList:
      if '/' not in c:
        continue
      system, cname = c.split( '/' )

      startDir = InstallTools.startDir
      currentLog = startDir + '/' + system + '_' + cname + '/log/current'
      # 'with open' replaces the py2-only file() builtin and guarantees
      # the log file is closed even if reading raises
      with open( currentLog, 'r' ) as logFile:
        logLines = logFile.readlines()

      # count errors in the last hour and last day
      errors_1 = 0
      errors_24 = 0
      now = dateTime()
      lastError = ''
      for line in logLines:
        if "ERROR:" in line:
          fields = line.split()
          recent = False
          timeStamp = fromString( fields[0] + ' ' + fields[1] )
          if ( now - timeStamp ) < hour:
            errors_1 += 1
            recent = True
          if ( now - timeStamp ) < day:
            errors_24 += 1
            recent = True
          if recent:
            lastError = line.split( 'ERROR:' )[-1].strip()

      resultDict[c] = {'ErrorsHour':errors_1, 'ErrorsDay':errors_24, 'LastError':lastError}

    return S_OK( resultDict )
Exemplo n.º 22
0
    def export_checkComponentLog(self, component):
        """ Check component log for errors

        :param component: 'System/Component' name, a list of such names,
                          or '*' for all set-up services and agents
        :return: S_OK with {component: {ErrorsHour, ErrorsDay, LastError}}
        """
        componentList = []
        if "*" in component:
            if component == "*":
                result = InstallTools.getSetupComponents()
                if result["OK"]:
                    for ctype in ["Services", "Agents"]:
                        if ctype in result["Value"]:
                            for sname in result["Value"][ctype]:
                                for cname in result["Value"][ctype][sname]:
                                    componentList.append("/".join([sname, cname]))
        elif isinstance(component, StringTypes):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for c in componentList:
            if "/" not in c:
                continue
            system, cname = c.split("/")

            startDir = InstallTools.startDir
            currentLog = startDir + "/" + system + "_" + cname + "/log/current"
            # 'with open' replaces the py2-only file() builtin and guarantees
            # the log file is closed even if reading raises
            with open(currentLog, "r") as logFile:
                logLines = logFile.readlines()

            # count errors in the last hour and last day
            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ""
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    timeStamp = fromString(fields[0] + " " + fields[1])
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split("ERROR:")[-1].strip()

            resultDict[c] = {"ErrorsHour": errors_1, "ErrorsDay": errors_24, "LastError": lastError}

        return S_OK(resultDict)
Exemplo n.º 23
0
    def __kickStuckJobs(self):
        """ Reschedule jobs stuck in initialization status Rescheduled, Matched
        """

        failures = []

        # (status name, age threshold in seconds)
        for status, delay in (("Matched", self.matchedTime),
                              ("Rescheduled", self.rescheduledTime)):
            checkTime = str(dateTime() - delay * second)
            result = self.jobDB.selectJobs({"Status": status}, older=checkTime)
            if not result["OK"]:
                self.log.error("Failed to select jobs", result["Message"])
                return result

            jobIDs = result["Value"]
            if jobIDs:
                self.log.info("Rescheduling %d jobs stuck in %s status" % (len(jobIDs), status))
                result = self.jobDB.rescheduleJobs(jobIDs)
                if "FailedJobs" in result:
                    failures.append(
                        "Failed to reschedule %d jobs stuck in %s status" % (len(result["FailedJobs"]), status))

        if failures:
            return S_ERROR("\n".join(failures))
        else:
            return S_OK()
Exemplo n.º 24
0
    def isProxyValid(self, valid=1000):
        """Check if the stored proxy is valid for at least ``valid`` seconds.

        :param valid: required remaining lifetime in seconds
        :return: S_OK(margin) when valid long enough, otherwise an S_ERROR
                 whose 'Value' entry carries the (possibly negative) margin
        """
        if not self.valid:
            failure = S_ERROR("Proxy is not valid for the requested length")
            failure["Value"] = 0
            return failure

        remaining = self.valid - dateTime()
        remainingSeconds = remaining.days * 86400 + remaining.seconds
        margin = remainingSeconds - valid
        if margin > 0:
            return S_OK(margin)

        failure = S_ERROR("Proxy is not valid for the requested length")
        failure["Value"] = margin
        return failure
Exemplo n.º 25
0
    def _renewCloudProxy(self):
        """Takes short lived proxy from the site director and
        promotes it to a long lived proxy keeping the DIRAC group.

        :returns: True on success, false otherwise.
        :rtype: bool
        """
        if not self._cloudDN or not self._cloudGroup:
            self.log.error(
                "Could not renew cloud proxy, DN and/or Group not set.")
            return False

        lifetime = int(
            self.ceParameters.get("Context_ProxyLifetime", DEF_PROXYLIFETIME))
        # Nothing to do while the current proxy (self.valid is a datetime)
        # still has more than the configured lifetime left
        if self.valid - dateTime() > lifetime * second:
            return True

        lifetime += DEF_PROXYGRACE
        self.log.info("Downloading proxy with cloudDN and cloudGroup: %s, %s" %
                      (self._cloudDN, self._cloudGroup))
        download = ProxyManagerClient().downloadProxy(
            self._cloudDN,
            self._cloudGroup,
            limited=True,
            requiredTimeLeft=lifetime)
        if not download["OK"]:
            self.log.error("Could not download proxy", download["Message"])
            return False

        dumped = download["Value"].dumpAllToString()
        if not dumped["OK"]:
            self.log.error("Failed to dump proxy to string",
                           dumped["Message"])
            return False

        self.proxy = dumped["Value"]
        self.valid = dateTime() + lifetime * second
        return True
Exemplo n.º 26
0
 def isProxyValid(self, valid=1000):
     """Check whether the stored proxy lives longer than ``valid`` seconds."""
     if not self.valid:
         # No validity information available at all.
         noProxy = S_ERROR("Proxy is not valid for the requested length")
         noProxy["Value"] = 0
         return noProxy
     timeLeft = self.valid - dateTime()
     secondsLeft = timeLeft.days * 86400 + timeLeft.seconds
     surplus = secondsLeft - valid
     if surplus > 0:
         return S_OK(surplus)
     shortage = S_ERROR("Proxy is not valid for the requested length")
     shortage["Value"] = surplus
     return shortage
Exemplo n.º 27
0
    def getToken(self, force=False):
        """Get the Keystone token

        :param force: flag to force getting the token if even there is one in the cache
        :return: S_OK(token) or S_ERROR
        """

        if self.token is not None and not force:
            # Reuse the cached token only while it is at least 5 minutes from
            # expiry. Use total_seconds() here: timedelta.seconds alone is
            # always in [0, 86400) and ignores the days component, so an
            # already-expired token (negative delta -> days=-1 with a large
            # .seconds) would wrongly be reused, and a token expiring days
            # from now could wrongly be refreshed.
            if self.expires and (self.expires - dateTime()).total_seconds() > 300:
                return S_OK(self.token)

        if self.apiVersion == 2:
            result = self.__getToken2()
        else:
            result = self.__getToken3()
        return result
Exemplo n.º 28
0
  def finalize( self ):
    """ Finalize the FTS job: refresh the metadata cache, register the
    successfully transferred files and send accounting information.

    :param self: self reference
    :return: S_OK() on success, S_ERROR propagated from failed steps
    """
    self.__updateMetadataCache()
    transEndTime = dateTime()
    regStartTime = time.time()
    res = self.getTransferStatistics()
    if not res['OK']:
      # res['Value'] used to be read unconditionally, raising KeyError on an
      # error result; propagate the error instead.
      return res
    transDict = res['Value']

    res = self.__registerSuccessful( transDict['transLFNs'] )
    if not res['OK']:
      return res

    regSuc, regTotal = res['Value']
    regTime = time.time() - regStartTime
    if self.sourceSE and self.targetSE:
      self.__sendAccounting( regSuc, regTotal, regTime, transEndTime, transDict )
    return S_OK()
Exemplo n.º 29
0
  def finalize( self ):
    """ Finalize the FTS job: refresh the metadata cache, register the
    successfully transferred files and send accounting information.

    :param self: self reference
    :return: S_OK() on success, S_ERROR propagated from failed steps
    """
    self.__updateMetadataCache()
    transEndTime = dateTime()
    regStartTime = time.time()
    res = self.getTransferStatistics()
    if not res['OK']:
      # res['Value'] used to be read unconditionally, raising KeyError on an
      # error result; propagate the error instead.
      return res
    transDict = res['Value']

    res = self.__registerSuccessful( transDict['transLFNs'] )
    if not res['OK']:
      return res

    regSuc, regTotal = res['Value']
    regTime = time.time() - regStartTime
    if self.sourceSE and self.targetSE:
      self.__sendAccounting( regSuc, regTotal, regTime, transEndTime, transDict )
    return S_OK()
Exemplo n.º 30
0
    def _markStalledJobs(self, stalledTime):
        """ Identifies stalled jobs running or completing without update longer than stalledTime.

        :param int stalledTime: seconds without a heartbeat after which a job may be declared Stalled
        :return: S_OK() / S_ERROR
        """
        stalledCounter = 0
        aliveCounter = 0
        # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
        checkTime = dateTime() - stalledTime * second
        checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
        # Only get jobs whose HeartBeat is older than the stalledTime
        result = self.jobDB.selectJobs({'Status': checkedStatuses},
                                       older=checkTime,
                                       timeStamp='HeartBeatTime')
        if not result['OK']:
            return result
        if not result['Value']:
            return S_OK()
        jobs = sorted(result['Value'])
        self.log.info(
            '%d %s jobs will be checked for being stalled, heartbeat before %s'
            % (len(jobs), ' & '.join(checkedStatuses), str(checkTime)))

        for job in jobs:
            delayTime = stalledTime
            # Add a tolerance time for some sites if required.
            # NOTE(review): attribute name 'site' is lowercase here — confirm
            # it matches the JobDB attribute naming.
            siteResult = self.jobDB.getJobAttribute(job, 'site')
            # Guard the lookup: previously ['Value'] was read unconditionally
            # and raised KeyError when the DB call returned an error result.
            site = siteResult.get('Value') if siteResult['OK'] else None
            if site in self.stalledJobsTolerantSites:
                delayTime += self.stalledJobsToleranceTime
            # Check if the job is really stalled
            result = self.__checkJobStalled(job, delayTime)
            if result['OK']:
                self.log.verbose('Updating status to Stalled for job %s' %
                                 (job))
                self.__updateJobStatus(job, 'Stalled')
                stalledCounter += 1
            else:
                self.log.verbose(result['Message'])
                aliveCounter += 1

        self.log.info('Total jobs: %d, Stalled jobs: %d, %s jobs: %d' %
                      (len(jobs), stalledCounter, '+'.join(checkedStatuses),
                       aliveCounter))
        return S_OK()
Exemplo n.º 31
0
    def _failSubmittingJobs(self):
        """Fail jobs stuck in Submitting status for a long time.
        They are due to a failed bulk submission transaction.
        """

        # Select jobs that have stayed in Submitting status beyond the
        # configured grace period.
        threshold = dateTime() - self.submittingTime * second
        selection = self.jobDB.selectJobs({"Status": JobStatus.SUBMITTING},
                                          older=threshold)
        if not selection["OK"]:
            self.log.error("Failed to select jobs", selection["Message"])
            return selection

        # Force every stuck job into Failed; log and carry on when an
        # individual update fails.
        for jobID in selection["Value"]:
            update = self._updateJobStatus(jobID, JobStatus.FAILED, force=True)
            if not update["OK"]:
                self.log.error("Failed to update job status",
                               update["Message"])

        return S_OK()
Exemplo n.º 32
0
  def __failSubmittingJobs(self):
    """ Fail jobs stuck in the Submitting status for a long time
        (caused by a failed bulk submission transaction).
    """

    # Anything still Submitting after the configured delay is stuck
    cutoff = str(dateTime() - self.submittingTime * second)
    result = self.jobDB.selectJobs({'Status': 'Submitting'}, older=cutoff)
    if not result['OK']:
      self.log.error('Failed to select jobs', result['Message'])
      return result

    # Mark each stuck job Failed; log individual failures and keep going
    for jobID in result['Value']:
      updateResult = self.__updateJobStatus(jobID, 'Failed')
      if not updateResult['OK']:
        self.log.error('Failed to update job status', updateResult['Message'])

    return S_OK()
Exemplo n.º 33
0
    def submitJobs(self):
        """ Go through defined computing elements and submit pilot jobs
        if there is matching waiting workload.

        :return: S_OK() / S_ERROR
        """

        # Materialise the keys: random.shuffle() below mutates the sequence
        # in place, which fails on a Python 3 dict view.
        queues = list(self.queueDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tags']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = list(result['Value'].keys())
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            queueTags = self.queueDict[queue]['ParametersDict']['Tags']
            siteMask = siteName in siteMaskList
            processorTags = []

            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = processorTags
            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = list(taskQueueDict.keys())
            tqIDListByProcessors = {}
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors.keys():

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None, lastUpdateTime)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                               ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)
                    httpProxy = self.queueDict[queue]['ParametersDict'].get(
                        'HttpProxy', '')

                    result = self.getExecutable(queue, pilotsToSubmit,
                                                bundleProxy, httpProxy,
                                                jobExecDir, processors)
                    if not result['OK']:
                        return result

                    executable, pilotSubmissionChunk = result['Value']
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    # dict.has_key() does not exist on Python 3; use "in"
                    if 'PilotStampDict' in result:
                        stampDict = result['PilotStampDict']
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    rndm = random.random() * sumPriority
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        if tqID not in tqDict:
                            tqDict[tqID] = []
                        tqDict[tqID].append(pilotID)

                    for tqID, pilotList in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            pilotList, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, '', stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in pilotList:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])
                                continue

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Exemplo n.º 34
0
    def submitJobs(self):
        """ Go through defined computing elements and submit pilot jobs
        if there is matching waiting workload.

        :return: S_OK() / S_ERROR
        """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
        if self.vo:
            tqDict["Community"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result["Value"]

        # Materialise the keys: random.shuffle() mutates the sequence in
        # place, which fails on a Python 3 dict view.
        queues = list(self.queueDict.keys())
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]["CE"]
            ceName = self.queueDict[queue]["CEName"]
            ceType = self.queueDict[queue]["CEType"]
            queueName = self.queueDict[queue]["QueueName"]
            siteName = self.queueDict[queue]["Site"]
            siteMask = siteName in siteMaskList

            if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
                queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result["OK"]:
                self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
                continue
            ceInfoDict = result["CEInfoDict"]
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (
                    ceName,
                    queueName,
                    ceInfoDict["WaitingJobs"],
                    ceInfoDict["RunningJobs"],
                    ceInfoDict["SubmittedJobs"],
                    ceInfoDict["MaxTotalJobs"],
                )
            )

            totalSlots = result["Value"]

            ceDict = ce.getParameterDict()
            ceDict["GridCE"] = ceName
            if not siteMask and "Site" in ceDict:
                self.log.info("Site not in the mask %s" % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict["Site"]
            if self.vo:
                ceDict["Community"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            # This is a hack to get rid of !
            ceDict["SubmitPool"] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.info("No matching TQs found")
                continue

            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
                )
                if not result["OK"]:
                    self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result["Value"]
                    self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
                % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
            )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get("BundleProxy", False)
                jobExecDir = ""
                if ceType == "CREAM":
                    jobExecDir = "."
                jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
                httpProxy = self.queueDict[queue].get("HttpProxy", "")

                result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
                if not result["OK"]:
                    return result

                executable, pilotSubmissionChunk = result["Value"]
                result = ce.submitJob(executable, "", pilotSubmissionChunk)
                os.unlink(executable)
                if not result["OK"]:
                    self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result["Value"]
                self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
                stampDict = {}
                # dict.has_key() does not exist on Python 3; use "in"
                if "PilotStampDict" in result:
                    stampDict = result["PilotStampDict"]
                tqPriorityList = []
                sumPriority = 0.0
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]["Priority"]
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in tqDict:
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                    )
                    if not result["OK"]:
                        self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot,
                            "Submitted",
                            ceName,
                            "Successfully submitted by the SiteDirector",
                            siteName,
                            queueName,
                        )
                        if not result["OK"]:
                            self.log.error("Failed to set pilot status: ", result["Message"])
                            continue

        return S_OK()
Exemplo n.º 35
0
  def sendAccounting( self, jobID ):
    """Send the WMS accounting record for a stalled job.

       Job attributes, logging history and heartbeat data are collected to
       estimate the start/end times and the consumed CPU/wall clock time,
       then an accounting Job record is committed. On success the job's
       'AccountedFlag' attribute is set to 'True'.

       :param jobID: ID of the job to be accounted
       :return: S_OK/S_ERROR as returned by the accounting commit
    """

    accountingReport = Job()

    result = self.jobDB.getJobAttributes( jobID )
    if not result['OK']:
      return result
    jobDict = result['Value']

    # Logging records are tuples of ( status, minor, app, stime, source )
    result = self.logDB.getJobLoggingInfo( jobID )
    if not result['OK']:
      logList = []
    else:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    endTime = ''

    if not startTime or startTime == 'None':
      # Fall back to the moment the job entered the 'Running' status
      for status, minor, app, stime, source in logList:
        if status == 'Running':
          startTime = stime
          break
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      startTime = fromString( startTime )

    # End time is when the job was first declared 'Stalled'. The logging
    # info was already retrieved above, so reuse logList instead of
    # querying the database a second time.
    for status, minor, app, stime, source in logList:
      if status == 'Stalled':
        endTime = stime
        break
    if not endTime:
      endTime = dateTime()

    if type( endTime ) in types.StringTypes:
      endTime = fromString( endTime )

    result = self.jobDB.getHeartBeatData( jobID )

    # Extract the maximal reported CPU/wall clock values and the latest
    # heartbeat timestamp
    lastCPUTime = 0
    lastWallTime = 0
    lastHeartBeatTime = jobDict['StartExecTime']
    if result['OK']:
      for name, value, heartBeatTime in result['Value']:
        if 'CPUConsumed' == name:
          try:
            value = int( float( value ) )
            if value > lastCPUTime:
              lastCPUTime = value
          except ( ValueError, TypeError ):
            # Malformed heartbeat value: keep the previous maximum
            pass
        if 'WallClockTime' == name:
          try:
            value = int( float( value ) )
            if value > lastWallTime:
              lastWallTime = value
          except ( ValueError, TypeError ):
            pass
        if heartBeatTime > lastHeartBeatTime:
          lastHeartBeatTime = heartBeatTime

    accountingReport.setStartTime( startTime )
    # Bug fix: endTime was computed above but never passed on, so the
    # accounting record previously got a default end time
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    # Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : 'unknown',
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : 0.0,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.warn( 'Failed to send accounting report for job %d' % int( jobID ) )
      self.log.error( result['Message'] )
    return result
Exemplo n.º 36
0
    def __getToken3(self):
        """Get the Keystone token for the version v3 of the keystone service

        Authentication is attempted, in order of preference, with
        user/password credentials, a VOMS proxy, or an application
        credential file. On success the token, its expiry, the project ID
        and the public service endpoint URLs are cached on the instance.

        :return: S_OK(token) or S_ERROR
        """

        domain = self.parameters.get("Domain", "Default")
        user = self.parameters.get("User")
        password = self.parameters.get("Password")
        appcred_file = self.parameters.get("Appcred")
        authDict = {}
        authArgs = {}
        if user and password:
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["password"],
                        "password": {
                            "user": {
                                "name": user,
                                "domain": {
                                    "name": domain
                                },
                                "password": password
                            }
                        },
                    }
                }
            }
        elif self.parameters.get("Auth") == "voms":
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["mapped"],
                        "mapped": {
                            "voms": True,
                            "identity_provider": "egi.eu",
                            "protocol": "mapped"
                        },
                    }
                }
            }
            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")
        elif appcred_file:
            # The application credentials are stored in a file of the format:
            # id secret
            # Use a context manager so the file is closed even if read() fails
            with open(appcred_file, "r") as ac_fd:
                auth_info = ac_fd.read().strip()
            ac_id, ac_secret = auth_info.split(" ", 1)
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["application_credential"],
                        "application_credential": {
                            "id": ac_id,
                            "secret": ac_secret
                        },
                    }
                }
            }
        else:
            return S_ERROR("No valid credentials provided")

        # appcred includes the project scope binding in the credential itself
        if self.project and not appcred_file:
            authDict["auth"]["scope"] = {
                "project": {
                    "domain": {
                        "name": domain
                    },
                    "name": self.project
                }
            }

        gLogger.debug("Request token with auth arguments: %s and body %s" %
                      (str(authArgs), str(authDict)))

        url = "%s/auth/tokens" % self.url
        try:
            result = requests.post(url,
                                   headers={
                                       "Content-Type": "application/json",
                                       "Accept": "application/json",
                                   },
                                   json=authDict,
                                   verify=self.caPath,
                                   **authArgs)

        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        if result.status_code not in [200, 201, 202, 203, 204]:
            return S_ERROR("Failed to get keystone token: %s" % result.text)

        # In v3 the token itself is returned in a response header
        try:
            self.token = result.headers["X-Subject-Token"]
        except Exception as exc:
            return S_ERROR("Failed to get keystone token: %s" % str(exc))

        output = result.json()

        # Keep the token lifetime relative to our own clock to avoid
        # problems with a skewed server clock
        expires = fromString(
            str(output["token"]["expires_at"]).replace("T",
                                                       " ").replace("Z", ""))
        issued = fromString(
            str(output["token"]["issued_at"]).replace("T",
                                                      " ").replace("Z", ""))
        self.expires = dateTime() + (expires - issued)

        if "project" in output["token"]:
            if output["token"]["project"]["name"] == self.project:
                self.projectID = output["token"]["project"]["id"]

        # Cache the public endpoint URL of each service of interest
        if "catalog" in output["token"]:
            for service in output["token"]["catalog"]:
                if service["type"] == "compute":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.computeURL = str(endpoint["url"])

                elif service["type"] == "image":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.imageURL = str(endpoint["url"])

                elif service["type"] == "network":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.networkURL = str(endpoint["url"])

        return S_OK(self.token)
Exemplo n.º 37
0
  def export_checkComponentLog(self, component):
    """ Check component log for errors

        :param component: '*' for all setup components, a 'System/Component'
                          string, or a list of such strings
        :return: S_OK with {component: {'ErrorsHour', 'ErrorsDay', 'LastError'}}
    """
    # Build the list of 'System/Component' names to inspect
    componentList = []
    if '*' in component:
      if component == '*':
        result = gComponentInstaller.getSetupComponents()
        if result['OK']:
          for ctype in ['Services', 'Agents', 'Executors']:
            if ctype in result['Value']:
              for sname in result['Value'][ctype]:
                for cname in result['Value'][ctype][sname]:
                  componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
      componentList = [component]
    else:
      componentList = component

    resultDict = {}
    for comp in componentList:
      if '/' not in comp:
        continue
      system, cname = comp.split('/')

      startDir = gComponentInstaller.startDir
      currentLog = startDir + '/' + system + '_' + cname + '/log/current'
      try:
        # open() with a context manager instead of the deprecated file()
        # builtin, so the log is always closed even if reading fails
        with open(currentLog, 'r') as logFile:
          logLines = logFile.readlines()
      except IOError as err:
        gLogger.error("File does not exists:", currentLog)
        resultDict[comp] = {'ErrorsHour': -1, 'ErrorsDay': -1, 'LastError': currentLog + '::' + repr(err)}
        continue

      # Count the ERROR lines within the last hour and the last day
      errors_1 = 0
      errors_24 = 0
      now = dateTime()
      lastError = ''
      for line in logLines:
        if "ERROR:" in line:
          fields = line.split()
          recent = False
          if len(fields) < 2:  # if the line contains only one word
            lastError = line.split('ERROR:')[-1].strip()
            continue
          timeStamp = fromString(fields[0] + ' ' + fields[1])
          if not timeStamp:  # if the timestamp is missing in the log
            lastError = line.split('ERROR:')[-1].strip()
            continue
          if (now - timeStamp) < hour:
            errors_1 += 1
            recent = True
          if (now - timeStamp) < day:
            errors_24 += 1
            recent = True
          if recent:
            lastError = line.split('ERROR:')[-1].strip()

      resultDict[comp] = {'ErrorsHour': errors_1, 'ErrorsDay': errors_24, 'LastError': lastError}

    return S_OK(resultDict)
Exemplo n.º 38
0
 def setProxy(self, proxy, valid=0):
     """Set the proxy to be used by this instance.

     :param proxy: proxy payload to store
     :param valid: validity period in seconds (default 0, i.e. already expired)
     """
     self.proxy = proxy
     # Expiry timestamp: now + 'valid' seconds ('second' is presumably the
     # DIRAC Time unit - confirm against the module imports)
     self.valid = dateTime() + second * valid
Exemplo n.º 39
0
    def _failStalledJobs(self, failedTime):
        """Set jobs that have been Stalled for too long to Failed and account them.

        Jobs Stalled longer than ``failedTime`` seconds are failed when their
        pilot is lost or when no sign of life was seen for that long; failed
        jobs are killed and an accounting record is sent. Previously failed
        but not yet accounted stalled jobs are also accounted.

        :param failedTime: stalled-time threshold in seconds
        :return: S_OK(number of jobs set to Failed) or S_ERROR
    """
        # Only get jobs that have been Stalled for long enough
        checkTime = dateTime() - failedTime * second
        result = self.jobDB.selectJobs({'Status': JobStatus.STALLED},
                                       older=checkTime)
        if not result['OK']:
            return result
        jobs = result['Value']

        failedCounter = 0
        # Two possible MinorStatus values for the failed jobs; also reused
        # below to select previously failed jobs for accounting
        minorStalledStatuses = ("Job stalled: pilot not running",
                                'Stalling for more than %d sec' % failedTime)

        if jobs:
            self.log.info(
                '%d jobs Stalled before %s will be checked for failure' %
                (len(jobs), str(checkTime)))

            for job in jobs:
                # setFailed doubles as a flag and as the MinorStatus string
                setFailed = False
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result['OK']:
                    self.log.error('Failed to get pilot status',
                                   result['Message'])
                    continue
                pilotStatus = result['Value']
                if pilotStatus != "Running":
                    setFailed = minorStalledStatuses[0]
                else:
                    # Verify that there was no sign of life for long enough
                    result = self.__getLatestUpdateTime(job)
                    if not result['OK']:
                        self.log.error('Failed to get job update time',
                                       result['Message'])
                        continue
                    elapsedTime = toEpoch() - result['Value']
                    if elapsedTime > failedTime:
                        setFailed = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
                if setFailed:
                    self.__sendKillCommand(job)
                    self.__updateJobStatus(job, JobStatus.FAILED, setFailed)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting',
                                       result['Message'])

        recoverCounter = 0

        # Account jobs that were failed earlier but not yet accounted
        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({
                'Status': JobStatus.FAILED,
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not result['OK']:
                return result
            if result['Value']:
                jobs = result['Value']
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting',
                                       result['Message'])
                        continue

                    recoverCounter += 1
            if not result['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)
Exemplo n.º 40
0
    def export_checkComponentLog(self, component):
        """ Check component log for errors

        :param component: '*' for all setup components, a 'System/Component'
                          string, or a list of such strings
        :return: S_OK with {component: {'ErrorsHour', 'ErrorsDay', 'LastError'}}
    """
        # Build the list of 'System/Component' names to inspect
        componentList = []
        if '*' in component:
            if component == '*':
                result = gComponentInstaller.getSetupComponents()
                if result['OK']:
                    for ctype in ['Services', 'Agents', 'Executors']:
                        if ctype in result['Value']:
                            for sname in result['Value'][ctype]:
                                for cname in result['Value'][ctype][sname]:
                                    componentList.append('/'.join(
                                        [sname, cname]))
        elif isinstance(component, basestring):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for comp in componentList:
            if '/' not in comp:
                continue
            system, cname = comp.split('/')

            startDir = gComponentInstaller.startDir
            currentLog = startDir + '/' + system + '_' + cname + '/log/current'
            try:
                # open() with a context manager instead of the deprecated
                # file() builtin, so the log is always closed
                with open(currentLog, 'r') as logFile:
                    logLines = logFile.readlines()
            except IOError as err:
                gLogger.error("File does not exists:", currentLog)
                resultDict[comp] = {
                    'ErrorsHour': -1,
                    'ErrorsDay': -1,
                    'LastError': currentLog + '::' + repr(err)
                }
                continue

            # Count the ERROR lines within the last hour and the last day
            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ''
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    if len(fields) < 2:  # if the line contains only one word
                        lastError = line.split('ERROR:')[-1].strip()
                        continue
                    timeStamp = fromString(fields[0] + ' ' + fields[1])
                    if not timeStamp:  # if the timestamp is missing in the log
                        lastError = line.split('ERROR:')[-1].strip()
                        continue
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split('ERROR:')[-1].strip()

            resultDict[comp] = {
                'ErrorsHour': errors_1,
                'ErrorsDay': errors_24,
                'LastError': lastError
            }

        return S_OK(resultDict)
Exemplo n.º 41
0
    def __getToken2(self):
        """Obtain a Keystone token from the v2 keystone service.

        Authentication uses either username/password credentials or a VOMS
        proxy, depending on the configured parameters. On success the token,
        its expiry, the project ID and the public service endpoint URLs are
        cached on the instance.

        :return: S_OK(token) or S_ERROR
        """

        authArgs = {}
        userName = self.parameters.get("User")
        userPassword = self.parameters.get("Password")
        if userName and userPassword:
            credentials = {"username": userName, "password": userPassword}
            authDict = {"auth": {"passwordCredentials": credentials}}
            if self.project:
                authDict["auth"]["tenantName"] = self.project
        elif self.parameters.get("Auth") == "voms":
            authDict = {"auth": {"voms": True}}
            if self.project:
                authDict["auth"]["tenantName"] = self.project

            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")

        try:
            result = requests.post(
                "%s/tokens" % self.url,
                headers={"Content-Type": "application/json"},
                json=authDict,
                verify=self.caPath,
                **authArgs)
        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        output = result.json()

        if result.status_code in [400, 401]:
            message = "None"
            if "error" in output:
                message = output["error"].get("message")
            return S_ERROR("Authorization error: %s" % message)

        tokenInfo = output["access"]["token"]

        def _toTime(stamp):
            # The service returns "T"/"Z"-decorated timestamps; normalize
            # before handing them to fromString
            return fromString(str(stamp).replace("T", " ").replace("Z", ""))

        self.token = str(tokenInfo["id"])
        # Keep the token lifetime relative to our own clock to avoid
        # problems with a skewed server clock
        self.expires = dateTime() + (_toTime(tokenInfo["expires"]) -
                                     _toTime(tokenInfo["issued_at"]))

        self.projectID = tokenInfo["tenant"]["id"]

        # Cache the public endpoint URL of each service of interest
        attributeByType = {
            "compute": "computeURL",
            "image": "imageURL",
            "network": "networkURL",
        }
        for service in output["access"]["serviceCatalog"]:
            attribute = attributeByType.get(service["type"])
            if attribute:
                setattr(self, attribute,
                        str(service["endpoints"][0]["publicURL"]))
        return S_OK(self.token)
Exemplo n.º 42
0
    def __obtainWMSJobIDs(self, transformation, fileDict, selectDelay,
                          wmsStatusList):
        """ Group files by the corresponding WMS jobIDs, check the corresponding
        jobs have not been updated for the delay time.  Can't get into any
        mess because we start from files only in MaxReset / Assigned and check
        corresponding jobs.  Mixtures of files for jobs in MaxReset and Assigned
        statuses only possibly include some files in Unused status (not Processed
        for example) that will not be touched.

        :param transformation: transformation ID
        :param fileDict: mapping lfn -> ( taskID, status )
        :param selectDelay: only consider tasks not updated for this many hours
        :param wmsStatusList: WMS statuses eligible for rechecking
        :return: S_OK( { wmsID: [ lfn, ... ] } ) or S_ERROR
    """
        taskIDList = sorted(
            set(taskID for taskID, _status in fileDict.values()))
        self.transLogger.verbose(
            "The following %d task IDs correspond to the selected files:\n%s" %
            (len(taskIDList), ', '.join(str(taskID) for taskID in taskIDList)))

        # Group the input files by task once, instead of re-scanning the
        # whole fileDict for every task in the loop below (was O(tasks*files))
        lfnsByTaskID = {}
        for lfn, (tID, _status) in fileDict.iteritems():
            lfnsByTaskID.setdefault(int(tID), []).append(lfn)

        jobFileDict = {}
        olderThan = dateTime() - datetime.timedelta(hours=selectDelay)

        res = self.transClient.getTransformationTasks(
            condDict={
                'TransformationID': transformation,
                'TaskID': taskIDList
            },
            older=olderThan,
            timeStamp='LastUpdateTime')
        if not res['OK']:
            self.transLogger.error("getTransformationTasks returned an error",
                                   '%s' % res['Message'])
            return res

        mandatoryKeys = {
            'TaskID', 'ExternalID', 'LastUpdateTime', 'ExternalStatus'
        }
        for taskDict in res['Value']:
            missingKey = mandatoryKeys - set(taskDict)
            if missingKey:
                for key in missingKey:
                    self.transLogger.warn(
                        'Missing key %s for job dictionary:\n\t%s' %
                        (key, str(taskDict)))
                continue

            taskID = taskDict['TaskID']
            wmsID = taskDict['ExternalID']
            wmsStatus = taskDict['ExternalStatus']

            if not int(wmsID):
                self.transLogger.verbose(
                    'TaskID %s: status is %s (jobID = %s) so will not recheck with WMS'
                    % (taskID, wmsStatus, wmsID))
                continue

            # Exclude jobs not having appropriate WMS status - have to trust that production management status is correct
            if wmsStatus not in wmsStatusList:
                self.transLogger.verbose(
                    'Job %s is in status %s, not in %s so will be ignored' %
                    (wmsID, wmsStatus, ', '.join(wmsStatusList)))
                continue

            # Must map unique files -> jobs in expected state
            jobFileDict[wmsID] = lfnsByTaskID.get(int(taskID), [])

            self.transLogger.info(
                'Found %d files for taskID %s, jobID %s (%s), last update %s' %
                (len(jobFileDict[wmsID]), taskID, wmsID, wmsStatus,
                 taskDict['LastUpdateTime']))

        return S_OK(jobFileDict)
Exemplo n.º 43
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary

        For each configured queue: check the site mask, obtain a pilot proxy,
        query the CE for free slots and the Matcher for eligible task queues,
        then submit pilots and register them in the PilotAgentsDB, assigning
        them to task queues proportionally to the task queue priorities.

        :return: S_OK() or S_ERROR
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        # Process the queues in random order to even out the load
        # (list() so shuffle works in both Python 2 and 3)
        queues = list(self.queueDict.keys())
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy; ask for one day more than the queue length
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: \n%s' %
                    (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                          (ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                           ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                          (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                # dict.has_key() is deprecated (removed in Python 3): use 'in'
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']
                # Build cumulative priority intervals for weighted sampling
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                tqDict = {}
                for pilotID in pilotList:
                    # Pick a task queue at random, weighted by priority.
                    # Default to the last TQ so tqID is always defined even
                    # if every priority is zero (previously a NameError).
                    rndm = random.random() * sumPriority
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    tqDict.setdefault(tqID, []).append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

        return S_OK()
Exemplo n.º 44
0
  def obtainWMSJobIDs(self, transformation, fileDict, selectDelay, wmsStatusList):
    """ Group files by the corresponding WMS jobIDs, check the corresponding
        jobs have not been updated for the delay time.  Can't get into any
        mess because we start from files only in MaxReset / Assigned and check
        corresponding jobs.  Mixtures of files for jobs in MaxReset and Assigned
        statuses only possibly include some files in Unused status (not Processed
        for example) that will not be touched.

        :param transformation: transformation ID used to select the tasks
        :param fileDict: mapping of LFN -> production task ID
        :param selectDelay: only consider tasks not updated for this many hours
        :param wmsStatusList: WMS statuses for which jobs are rechecked
        :return: S_OK with a dict of WMS jobID -> list of LFNs, or the
                 failing S_ERROR from getTransformationTasks
    """
    prodJobIDs = uniqueElements(fileDict.values())
    self.log.info('The following %s production jobIDs apply to the selected files:\n%s' % (len(prodJobIDs), prodJobIDs))

    jobFileDict = {}
    condDict = {'TransformationID': transformation, self.taskIDName: prodJobIDs}
    # Only tasks whose LastUpdateTime is older than the requested delay qualify
    delta = datetime.timedelta(hours=selectDelay)
    now = dateTime()
    olderThan = now - delta

    res = self.prodDB.getTransformationTasks(condDict=condDict, older=olderThan,
                                             timeStamp='LastUpdateTime', inputVector=True)
    self.log.debug(res)
    if not res['OK']:
      # Interpolate the actual error message (the original format string was
      # logged verbatim with the '%s' placeholder never filled in)
      self.log.error('getTransformationTasks returned an error:\n%s' % res['Message'])
      return res

    for jobDict in res['Value']:
      # Skip any task record that lacks one of the required keys
      missingKey = False
      for key in [self.taskIDName, self.externalID, 'LastUpdateTime', self.externalStatus, 'InputVector']:
        if key not in jobDict:
          self.log.info('Missing key %s for job dictionary, the following is available:\n%s' % (key, jobDict))
          missingKey = True
          continue

      if missingKey:
        continue

      job = jobDict[self.taskIDName]
      wmsID = jobDict[self.externalID]
      lastUpdate = jobDict['LastUpdateTime']
      wmsStatus = jobDict[self.externalStatus]
      jobInputData = jobDict['InputVector']
      jobInputData = [lfn.replace('LFN:', '') for lfn in jobInputData.split(';')]

      # A zero / empty WMS ID means the job was never submitted to the WMS
      if not int(wmsID):
        self.log.info('Prod job %s status is %s (ID = %s) so will not recheck with WMS' % (job, wmsStatus, wmsID))
        continue

      self.log.info('Job %s, prod job %s last update %s, production management system status %s' % (wmsID, job, lastUpdate, wmsStatus))
      # Exclude jobs not having appropriate WMS status - have to trust that production management status is correct
      if wmsStatus not in wmsStatusList:
        self.log.info('Job %s is in status %s, not %s so will be ignored' % (wmsID, wmsStatus, ', '.join(wmsStatusList)))
        continue

      finalJobData = []
      # Must map unique files -> jobs in expected state
      for lfn, prodID in fileDict.items():
        if int(prodID) == int(job):
          finalJobData.append(lfn)

      self.log.info('Found %s files for job %s' % (len(finalJobData), job))
      jobFileDict[wmsID] = finalJobData

    return S_OK(jobFileDict)
Exemplo n.º 45
0
    def export_checkComponentLog(self, component):
        """Check component log for errors.

        :param component: "*" to scan every set-up service/agent/executor,
                          a single "system/component" string, or a list of
                          such strings
        :return: S_OK with a dict mapping each component to
                 {"ErrorsHour": int, "ErrorsDay": int, "LastError": str};
                 error counts are -1 when the log file could not be read
        """
        componentList = []
        if "*" in component:
            if component == "*":
                # Expand the wildcard to every set-up component of each type
                result = gComponentInstaller.getSetupComponents()
                if result["OK"]:
                    for ctype in ["Services", "Agents", "Executors"]:
                        if ctype in result["Value"]:
                            for sname in result["Value"][ctype]:
                                for cname in result["Value"][ctype][sname]:
                                    componentList.append("/".join(
                                        [sname, cname]))
        elif isinstance(component, six.string_types):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for comp in componentList:
            # Components must be specified as "system/component"
            if "/" not in comp:
                continue
            system, cname = comp.split("/")

            startDir = gComponentInstaller.startDir
            currentLog = startDir + "/" + system + "_" + cname + "/log/current"
            try:
                with open(currentLog, "r") as logFile:
                    logLines = logFile.readlines()
            except IOError as err:
                # Fixed message grammar ("does not exists" -> "does not exist")
                gLogger.error("File does not exist:", currentLog)
                resultDict[comp] = {
                    "ErrorsHour": -1,
                    "ErrorsDay": -1,
                    "LastError": currentLog + "::" + repr(err)
                }
                continue

            errors_1 = 0   # errors seen within the last hour
            errors_24 = 0  # errors seen within the last day
            now = dateTime()
            lastError = ""
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    if len(fields) < 2:  # if the line contains only one word
                        lastError = line.split("ERROR:")[-1].strip()
                        continue
                    timeStamp = fromString(fields[0] + " " + fields[1])
                    if not timeStamp:  # if the timestamp is missing in the log
                        lastError = line.split("ERROR:")[-1].strip()
                        continue
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split("ERROR:")[-1].strip()

            resultDict[comp] = {
                "ErrorsHour": errors_1,
                "ErrorsDay": errors_24,
                "LastError": lastError
            }

        return S_OK(resultDict)
Exemplo n.º 46
0
  def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

        Aggregates the requirements of all configured queues, asks the Matcher
        for matching task queues, then iterates over the queues (in random
        order), submitting pilot jobs per processor tag and registering them
        in the PilotAgentsDB, with pilots assigned to task queues
        proportionally to the task queue priorities.
    """

    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    # Union of the Tag lists of all queues, deduplicated
    tags = []
    for queue in queues:
      tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
      return result
    if not result['Value']:
      self.log.verbose('No Waiting jobs suitable for the director')
      return S_OK()

    # Collect the set of sites with waiting jobs; anySite is set when some
    # task queue accepts any site, testSites collects sites of TQs that
    # also specify JobTypes
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add(site)
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add(site)
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    # Count the pilots already waiting against these task queues (informational)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
      tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Site mask used to decide between normal and "Test" submission
    result = self.siteClient.getUsableSites()
    if not result['OK']:
      return result
    siteMaskList = result['Value']

    # Randomize the queue order so no queue is systematically favoured
    queues = self.queueDict.keys()
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      queueTags = self.queueDict[queue]['ParametersDict']['Tag']
      siteMask = siteName in siteMaskList
      processorTags = []

      # Check the status of the Site
      result = self.siteClient.getUsableSites(siteName)
      if not result['OK']:
        self.log.error("Can not get the status of site %s: %s" %
                       (siteName, result['Message']))
        continue
      if siteName not in result.get('Value', []):
        self.log.info("site %s is not active" % siteName)
        continue

      if self.rssFlag:
        # Check the status of the ComputingElement
        result = self.rssClient.getElementStatus(ceName, "ComputingElement")
        if not result['OK']:
          # NOTE(review): the message interpolates siteName, not ceName —
          # confirm this is intended
          self.log.error("Can not get the status of computing element",
                         " %s: %s" % (siteName, result['Message']))
          continue
        if result['Value']:
          # get the value of the status
          result = result['Value'][ceName]['all']

        if result not in ('Active', 'Degraded'):
          self.log.verbose(
              "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
          continue

      # Collect multiprocessor-related tags ("<N>Processors" and "WholeNode")
      for tag in queueTags:
        if re.match(r'^[0-9]+Processors$', tag):
          processorTags.append(tag)
      if 'WholeNode' in queueTags:
        processorTags.append('WholeNode')

      if not anySite and siteName not in jobSites:
        self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
      else:
        self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict['GridCE'] = ceName
      # if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        # Sites out of the mask only receive Test jobs
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms(platform)
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      ceDict['Tag'] = queueTags
      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues(ceDict)
      if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose('No matching TQs found for %s' % queue)
        continue

      # Group the matched task queues and their job counts per processor tag
      matchedQueues += 1
      totalTQJobs = 0
      totalTQJobsByProcessors = {}
      tqIDList = taskQueueDict.keys()
      tqIDListByProcessors = {}
      for tq in taskQueueDict:
        if 'Tags' not in taskQueueDict[tq]:
          # skip non multiprocessor tqs
          continue
        for tag in taskQueueDict[tq]['Tags']:
          if tag in processorTags:
            tqIDListByProcessors.setdefault(tag, [])
            tqIDListByProcessors[tag].append(tq)

            totalTQJobsByProcessors.setdefault(tag, 0)
            totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs,
                                                                                      len(tqIDList), queue))

      queueSubmittedPilots = 0
      for tag in tqIDListByProcessors:

        self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

        # Translate the tag into a processor count (-1 means whole node)
        processors = 1

        m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
        if m:
          processors = int(m.group('processors'))
        if tag == 'WholeNode':
          processors = -1

        tagTQJobs = totalTQJobsByProcessors[tag]
        tagTqIDList = tqIDListByProcessors[tag]

        # Get the number of already waiting pilots for these task queues
        tagWaitingPilots = 0
        if self.pilotWaitingFlag:
          lastUpdateTime = dateTime() - self.pilotWaitingTime * second
          result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                              'Status': WAITING_PILOT_STATUS},
                                             None, lastUpdateTime)
          if not result['OK']:
            self.log.error('Failed to get Number of Waiting pilots', result['Message'])
            tagWaitingPilots = 0
          else:
            tagWaitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
        if tagWaitingPilots >= tagTQJobs:
          self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
          continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots,
                                                                                         tagTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.getQueueSlots(queue, False)
        if totalSlots == 0:
          self.log.debug('%s: No slots available' % queue)
          continue

        # Note: comparing slots to job numbers is not accurate in multiprocessor case.
        #       This could lead to over submission.
        pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

        while pilotsToSubmit > 0:
          self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

          bundleProxy = self.queueDict[queue].get('BundleProxy', False)
          jobExecDir = ''
          jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

          executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                bundleProxy=bundleProxy,
                                                                jobExecDir=jobExecDir,
                                                                processors=processors)
          result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
          # ## FIXME: The condor thing only transfers the file with some
          # ## delay, so when we unlink here the script is gone
          # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
          if ceType != 'HTCondorCE':
            os.unlink(executable)
          if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
            pilotsToSubmit = 0
            self.failedQueues[queue] += 1
            continue

          pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
          queueSubmittedPilots += pilotSubmissionChunk
          # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
          # task queue priorities
          pilotList = result['Value']
          self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
          totalSubmittedPilots += len(pilotList)
          self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
          stampDict = {}
          if 'PilotStampDict' in result:
            stampDict = result['PilotStampDict']
          # Build a cumulative priority list for weighted random assignment
          tqPriorityList = []
          sumPriority = 0.
          for tq in tagTqIDList:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
          rndm = random.random() * sumPriority
          tqDict = {}
          for pilotID in pilotList:
            # Pick a task queue with probability proportional to its priority
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
              if rndm < prio:
                tqID = tq
                break
            if tqID not in tqDict:
              tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

          # Register the new pilots and their TQ assignment in the DB
          for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList,
                                                       tqID,
                                                       self.pilotDN,
                                                       self.pilotGroup,
                                                       self.localhost,
                                                       ceType,
                                                       stampDict)
            if not result['OK']:
              self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
              continue
            for pilot in pilotList:
              result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                    'Successfully submitted by the SiteDirector',
                                                    siteName, queueName)
              if not result['OK']:
                self.log.error('Failed to set pilot status: ', result['Message'])
                continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
Exemplo n.º 47
0
def main():
    """Sort the site names under '/Resources/Sites' in the Configuration Service
    and commit the result.

    Requires a valid proxy carrying the CSAdministrator property. Sorting is
    alphabetical by default; the -C/--country switch sorts by country postfix
    and -R/--reverse reverses the order. Positional arguments restrict sorting
    to the named subsections (e.g. "LCG DIRAC"); with no arguments all
    subsections of '/Resources/Sites' are sorted.
    """
    Script.registerSwitch(
        "C", "country",
        "Sort site names by country postfix (i.e. LCG.IHEP.cn, LCG.IN2P3.fr, LCG.IHEP.su)",
        sortBy)
    Script.registerSwitch("R", "reverse", "Reverse the sort order", isReverse)
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument([
        "Section: Name of the subsection in '/Resources/Sites/' for sort (i.e. LCG DIRAC)"
    ],
                            mandatory=False)

    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    # A valid, unexpired proxy with the CSAdministrator property is required
    result = getProxyInfo()
    if not result["OK"]:
        gLogger.error("Failed to get proxy information", result["Message"])
        DIRACExit(2)
    proxy = result["Value"]
    if proxy["secondsLeft"] < 1:
        gLogger.error("Your proxy has expired, please create new one")
        DIRACExit(2)
    group = proxy["group"]
    if "CSAdministrator" not in getPropertiesForGroup(group):
        gLogger.error(
            "You must be CSAdministrator user to execute this script")
        gLogger.notice(
            "Please issue 'dirac-proxy-init -g [group with CSAdministrator Property]'"
        )
        DIRACExit(2)

    cs = CSAPI()
    result = cs.getCurrentCFG()
    if not result["OK"]:
        gLogger.error("Failed to get copy of CS", result["Message"])
        DIRACExit(2)
    cfg = result["Value"]

    if not cfg.isSection("Resources"):
        gLogger.error("Section '/Resources' is absent in CS")
        DIRACExit(2)

    if not cfg.isSection("Resources/Sites"):
        gLogger.error("Subsection '/Resources/Sites' is absent in CS")
        DIRACExit(2)

    # Sort only the requested subsections, or all of them when none are given
    # (the former "args and len(args) > 0" test was redundant)
    if args:
        resultList = args[:]
    else:
        resultList = cfg["Resources"]["Sites"].listSections()

    hasRun = False
    isDirty = False
    for i in resultList:
        if not cfg.isSection("Resources/Sites/%s" % i):
            # Fixed message grammar ("does not exists" -> "does not exist")
            gLogger.error("Subsection /Resources/Sites/%s does not exist" % i)
            continue
        hasRun = True
        if SORTBYNAME:
            dirty = cfg["Resources"]["Sites"][i].sortAlphabetically(
                ascending=not REVERSE)
        else:
            dirty = cfg["Resources"]["Sites"][i].sortByKey(key=country,
                                                           reverse=REVERSE)
        if dirty:
            isDirty = True

    if not hasRun:
        gLogger.notice(
            "Failed to find suitable subsections with site names to sort")
        DIRACExit(0)

    if not isDirty:
        gLogger.notice("Nothing to do, site names are already sorted")
        DIRACExit(0)

    # Leave a comment in the CS recording which script sorted the sites and when
    timestamp = toString(dateTime())
    stamp = "Site names are sorted by %s script at %s" % (Script.scriptName,
                                                          timestamp)
    cs.setOptionComment("/Resources/Sites", stamp)

    result = cs.commit()
    if not result["OK"]:
        gLogger.error("Failed to commit changes to CS", result["Message"])
        DIRACExit(2)
    gLogger.notice("Site names are sorted and committed to CS")
    DIRACExit(0)
Exemplo n.º 48
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add( site )
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add( site )
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                           None )
    totalWaitingPilots = 0
    if result['OK']:
      totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    #if totalWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10-failedCount ) )
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      siteMask = siteName in siteMaskList

      if not anySite and siteName not in jobSites:
        self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) )
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) )
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      #if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools
      
      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found for %s' % queue )
        continue

      matchedQueues += 1
      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) )

      # Get the number of already waiting pilots for these task queues
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                              None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
      if totalWaitingPilots >= totalTQJobs:
        self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
        continue

      self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      totalSlots = self.__getQueueSlots( queue )
      if totalSlots == 0:
        self.log.debug( '%s: No slots available' % queue )
        continue

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                              ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          self.failedQueues[queue] += 1
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
        totalSubmittedPilots += len( pilotList )
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
Exemplo n.º 49
0
    def execute(self):
        """The main agent execution method.

        Queues the per-job checks (stall marking, stall failing, accounting)
        on the thread pool, waits for all of them, then serially fails
        submitting jobs and kicks stuck jobs.

        :return: S_OK() always; individual step failures are only logged
        """
        futures = []

        # 1) Queueing the jobs that might be marked Stalled
        # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
        checkTime = dateTime() - self.stalledTime * second
        checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
        # Only get jobs whose HeartBeat is older than the stalledTime
        result = self.jobDB.selectJobs({"Status": checkedStatuses},
                                       older=checkTime,
                                       timeStamp="HeartBeatTime")
        if not result["OK"]:
            # An S_ERROR dict carries no "Value" key: do NOT fall through to
            # the result["Value"] access below (it would raise KeyError)
            self.log.error(
                "Issue selecting %s jobs" % " & ".join(checkedStatuses),
                result["Message"])
        elif result["Value"]:
            jobs = sorted(result["Value"])
            self.log.info(
                "%s jobs will be checked for being stalled" %
                " & ".join(checkedStatuses),
                "(n=%d, heartbeat before %s)" % (len(jobs), str(checkTime)),
            )
            for job in jobs:
                future = self.threadPoolExecutor.submit(
                    self._execute, "%s:_markStalledJobs" % job)
                futures.append(future)

        # 2) fail Stalled Jobs
        result = self.jobDB.selectJobs({"Status": JobStatus.STALLED})
        if not result["OK"]:
            self.log.error("Issue selecting Stalled jobs", result["Message"])
        elif result["Value"]:
            jobs = sorted(result["Value"])
            self.log.info("Jobs Stalled will be checked for failure",
                          "(n=%d)" % len(jobs))
            for job in jobs:
                future = self.threadPoolExecutor.submit(
                    self._execute, "%s:_failStalledJobs" % job)
                futures.append(future)

        # 3) Send accounting for Failed jobs not yet accounted
        for minor in self.minorStalledStatuses:
            result = self.jobDB.selectJobs({
                "Status": JobStatus.FAILED,
                "MinorStatus": minor,
                "AccountedFlag": "False"
            })
            if not result["OK"]:
                self.log.error("Issue selecting jobs for accounting",
                               result["Message"])
            elif result["Value"]:
                jobs = result["Value"]
                self.log.info("Stalled jobs will be Accounted",
                              "(n=%d)" % (len(jobs)))
                for job in jobs:
                    future = self.threadPoolExecutor.submit(
                        self._execute, "%s:_sendAccounting" % job)
                    futures.append(future)

        # Wait for every queued check; log (but never propagate) exceptions
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                self.log.error("_execute generated an exception: %s" % exc)

        # From here on we don't use the threads

        # 4) Fail submitting jobs
        result = self._failSubmittingJobs()
        if not result["OK"]:
            self.log.error("Failed to process jobs being submitted",
                           result["Message"])

        # 5) Kick stuck jobs
        result = self._kickStuckJobs()
        if not result["OK"]:
            self.log.error("Failed to kick stuck jobs", result["Message"])

        return S_OK()
Exemplo n.º 50
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states

        First pass over the queues: poll the CE for this director's pilots in
        transient states and synchronize the PilotAgentsDB status with the
        CE-reported one, retrieving output for pilots reaching a final state.
        Second pass: retrieve output and send accounting for pilots already in
        final states (when enabled).

        :return: S_OK() normally; S_ERROR only if a pilot proxy cannot be obtained
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      # Select only this director's pilots in transient states for this queue
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      # Append the pilot stamp to each reference; if any pilot lacks a stamp,
      # fall back to the plain references for the whole queue
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # Renew the pilot proxy on the CE when the current one is not valid
      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      # Reconcile the DB status with the CE-reported status, pilot by pilot
      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1 

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      # NOTE(review): unlike the isProxyValid() call in the first loop, the
      # return value here is used directly as a boolean; if isProxyValid()
      # returns a (truthy) result dict this condition is always False and the
      # proxy is never renewed here -- verify against the CE API
      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      # Pilots already final but whose output has not been fetched yet
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Exemplo n.º 51
0
# Sort the site names inside each selected /Resources/Sites subsection of the
# loaded configuration copy.
# NOTE(review): hasRun and isDirty are presumably initialised to False earlier
# in the script -- not visible in this chunk.
for i in resultList:
    if not cfg.isSection("Resources/Sites/%s" % i):
        gLogger.error("Subsection /Resources/Sites/%s does not exists" % i)
        continue
    hasRun = True  # at least one valid subsection was processed
    if SORTBYNAME:
        # Plain alphabetical sort of the site names
        dirty = cfg["Resources"]["Sites"][i].sortAlphabetically(ascending=not REVERSE)
    else:
        # Sort using the country-extracting key function
        dirty = cfg["Resources"]["Sites"][i].sortByKey(key=country, reverse=REVERSE)
    if dirty:
        isDirty = True  # something changed, so a commit will be needed

if not hasRun:
    gLogger.notice("Failed to find suitable subsections with site names to sort")
    DIRAC.exit(0)

if not isDirty:
    gLogger.notice("Nothing to do, site names are already sorted")
    DIRAC.exit(0)

# Record who sorted the sites and when, as a comment on the section
timestamp = toString(dateTime())
stamp = "Site names are sorted by %s script at %s" % (Script.scriptName, timestamp)
cs.setOptionComment("/Resources/Sites", stamp)

# Commit the modified configuration back to the CS server
result = cs.commit()
if not result["OK"]:
    gLogger.error("Failed to commit changes to CS", result["Message"])
    DIRAC.exit(2)
gLogger.notice("Site names are sorted and committed to CS")
DIRAC.exit(0)
Exemplo n.º 52
0
 def setProxy(self, proxy, valid=0):
     """Attach *proxy* to this instance and record when it expires.

     :param proxy: proxy object to be used by this instance
     :param valid: validity period in seconds (default 0)
     """
     expiry = dateTime() + valid * second
     self.proxy = proxy
     self.valid = expiry
Exemplo n.º 53
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        For every configured queue with matching waiting task-queue jobs,
        pilots are submitted up to the available slots, registered in the
        PilotAgentsDB and assigned to task queues proportionally to the
        task-queue priorities.

        :return: S_OK() on completion; S_ERROR on fatal failures (proxy,
                 matcher or pilot-executable generation errors)
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    # Global pre-check: bail out early if no task queue matches at all
    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    # Randomize the queue order so no queue is systematically favoured
    queues = self.queueDict.keys()
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy (queue CPU time plus one day of margin)
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      # Build the matching dictionary describing this queue's capabilities
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      pilotsToSubmit = min( totalSlots, totalTQJobs )

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      # Discount already waiting pilots from the number to submit
      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                              ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        # Generate the pilot executable (a temporary file, removed below)
        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        # dict.has_key is a Python 2 idiom; this file targets Python 2
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        # Build the cumulative-priority list used for the weighted random draw
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          # Fresh draw per pilot; higher-priority TQs get more pilots
          rndm = random.random()*sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        # Register the submitted pilots per task queue
        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                     tqID,
                                                     self.pilotDN,
                                                     self.pilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()