Пример #1
0
class JobCleaningAgent( AgentModule ):
  """ Agent removing old jobs from the WMS databases.

      On each cycle the agent removes the jobs in "Deleted" status and then,
      for the job types that are not listed as production types, the jobs
      selected with each status of REMOVE_STATUS_DELAY and a cut-off time of
      "now minus the corresponding number of days".

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Create the DB clients and read the agent options.

        :return: S_OK()
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()

    # Types given in the agent options take precedence over the Operations section
    self.prod_types = self.am_getOption( 'ProductionTypes', [] )
    if not self.prod_types:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join() is used because string.join() only exists in Python 2
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )

    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the job types that are allowed to be cleaned.

        :return: S_OK( list ) with the distinct JobType values of the JobDB that
                 are not in self.prod_types, or the failed JobDB result
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = [ jobType for jobType in result[ 'Value' ] if jobType not in self.prod_types ]
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Run one cleaning cycle.

        :return: S_OK(), or the failed result of the "Deleted" jobs removal or
                 of the job types lookup. Failures for the individual
                 REMOVE_STATUS_DELAY statuses are only logged.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result

    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = result[ 'Value' ]
    if not cleanJobTypes:
      # No job type is eligible for cleaning: nothing to select
      return S_OK()

    # Remove jobs with final status
    for status, delay in REMOVE_STATUS_DELAY.items():
      condDict = { 'JobType' : cleanJobTypes, 'Status' : status }
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove the jobs matching the given conditions from the WMS databases.

        At most self.maxJobsAtOnce jobs are treated per call. Jobs for which the
        removal of the oversized sandbox could not be scheduled are left in place.

        :param dict condDict: selection conditions passed to JobDB.selectJobs()
        :param delay: when set, passed as the "older" argument of JobDB.selectJobs()
        :return: S_OK(), or the failed result of the job selection or of the
                 oversized sandbox removal scheduling
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    # Enforce the limit locally as well, whatever the selection returned
    jobList = result['Value'][:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      # Best effort: the jobs are removed anyway
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    # Keep the jobs whose sandbox removal could not be scheduled
    failedJobs = result['Value']['Failed']
    jobList = [ jobID for jobID in jobList if jobID not in failedJobs ]

    if self.jobByJob:
      for jobID in jobList:
        # All three removals are attempted before any outcome is checked
        outcomes = ( ( 'JobDB', self.jobDB.removeJobFromDB( jobID ) ),
                     ( 'TaskQueueDB', self.taskQueueDB.deleteJob( jobID ) ),
                     ( 'JobLoggingDB', self.jobLoggingDB.deleteJob( jobID ) ) )
        errorFlag = False
        for dbName, dbResult in outcomes:
          if not dbResult['OK']:
            # Report the message of the call that failed, not of an earlier result
            gLogger.warn( 'Failed to remove job %d from %s' % ( jobID, dbName ), dbResult['Message'] )
            errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Schedule the removal of the oversized output sandboxes of the given jobs.

        :param list jobIDList: IDs of the jobs to treat
        :return: S_OK( { 'Successful' : { jobID : lfn or note }, 'Failed' : { jobID : lfn } } ).
                 Jobs whose 'OutputSandboxLFN' parameter could not be read are
                 only logged and appear in neither dictionary.
    """
    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if not result['OK']:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
        continue
      lfn = result['Value']
      if lfn:
        lfnDict[lfn] = jobID
      else:
        successful[jobID] = 'No oversized sandbox found'
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, on behalf of the job owner
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK'] or not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if result['OK']:
        successful[jobID] = lfn
      else:
        failed[jobID] = lfn

    return S_OK( {'Successful':successful, 'Failed':failed} )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Submit a request with a single RemoveFile operation for the given LFN.

        :param str lfn: LFN of the file to remove
        :param str ownerDN: value set as the request OwnerDN
        :param str ownerGroup: value set as the request OwnerGroup
        :return: result of ReqClient().putRequest()
    """
    removedFile = File()
    removedFile.LFN = lfn

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'
    removeFile.addFile( removedFile )

    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Пример #2
0
class JobCleaningAgent( AgentModule ):
  """ Agent removing old jobs from the JobDB, TaskQueueDB and JobLoggingDB.

      On each cycle the agent removes the jobs in "Deleted" status and then,
      for the job types that are not listed as production types, the jobs
      selected with each status of REMOVE_STATUS_DELAY and a cut-off time of
      "now minus the corresponding number of days".

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Create the DB clients and read the agent options.

        :return: S_OK()
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()

    # Types given in the agent options take precedence over the Operations section
    self.prod_types = self.am_getOption( 'ProductionTypes', [] )
    if not self.prod_types:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join() is used because string.join() only exists in Python 2
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )

    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the job types that are allowed to be cleaned.

        :return: S_OK( list ) with the distinct JobType values of the JobDB that
                 are not in self.prod_types, or the failed JobDB result
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = [ jobType for jobType in result[ 'Value' ] if jobType not in self.prod_types ]
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Run one cleaning cycle.

        :return: S_OK(), or the failed result of the "Deleted" jobs removal or
                 of the job types lookup. Failures for the individual
                 REMOVE_STATUS_DELAY statuses are only logged.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result

    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = result[ 'Value' ]
    if not cleanJobTypes:
      # No job type is eligible for cleaning: nothing to select
      return S_OK()

    # Remove jobs with final status
    for status, delay in REMOVE_STATUS_DELAY.items():
      condDict = { 'JobType' : cleanJobTypes, 'Status' : status }
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove the jobs matching the given conditions from the WMS databases.

        At most self.maxJobsAtOnce jobs are treated per call.

        :param dict condDict: selection conditions passed to JobDB.selectJobs()
        :param delay: when set, passed as the "older" argument of JobDB.selectJobs()
        :return: S_OK(), or the failed result of the job selection
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    # Enforce the limit locally as well, whatever the selection returned
    jobList = result['Value'][:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      # Best effort: the jobs are removed anyway
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        # All three removals are attempted before any outcome is checked
        outcomes = ( ( 'JobDB', self.jobDB.removeJobFromDB( jobID ) ),
                     ( 'TaskQueueDB', self.taskQueueDB.deleteJob( jobID ) ),
                     ( 'JobLoggingDB', self.jobLoggingDB.deleteJob( jobID ) ) )
        errorFlag = False
        for dbName, dbResult in outcomes:
          if not dbResult['OK']:
            # Report the message of the call that failed, not of an earlier result
            gLogger.warn( 'Failed to remove job %d from %s' % ( jobID, dbName ), dbResult['Message'] )
            errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
Пример #3
0
class JobCleaningAgent( AgentModule ):
  """ Agent removing old jobs from the WMS databases.

      On each cycle the agent removes the jobs in "Deleted" status and then,
      for the job types that are not listed as production types, the jobs
      selected with each status of self.removeStatusDelay and a cut-off time
      of "now minus the corresponding number of days".

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor: declare the attributes, the actual values are set in initialize()
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.prod_types = []

    # NOTE(review): initialize() overwrites these with option defaults that differ
    # from the values below (MaxJobsAtOnce 500, Killed 7) - confirm which are intended
    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.removeStatusDelay = {'Done':7,
                              'Killed':1,
                              'Failed':7 }

  #############################################################################
  def initialize( self ):
    """ Create the DB clients and read the agent options.

        :return: S_OK()
    """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()

    # Types given in the agent options take precedence over the Operations section
    self.prod_types = self.am_getOption( 'ProductionTypes', [] )
    if not self.prod_types:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )

    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )

    # Delays are multiplied by Time.day in execute()
    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the job types that are allowed to be cleaned.

        :return: S_OK( list ) with the distinct JobType values of the JobDB that
                 are not in self.prod_types, or the failed JobDB result
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = [ jobType for jobType in result[ 'Value' ] if jobType not in self.prod_types ]
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status.

        :return: S_OK(), or the failed result of the "Deleted" jobs removal or
                 of the job types lookup. Failures for the individual statuses
                 of self.removeStatusDelay are only logged.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result

    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = result[ 'Value' ]

    # No jobs in the system subject to removal
    if not cleanJobTypes:
      return S_OK()

    # Remove jobs with final status
    for status, delay in self.removeStatusDelay.items():
      condDict = { 'JobType' : cleanJobTypes, 'Status' : status }
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove the jobs matching the given conditions from the WMS databases.

        At most self.maxJobsAtOnce jobs are treated per call. Jobs for which the
        removal of the oversized sandbox could not be scheduled are left in place.

        :param dict condDict: selection conditions passed to JobDB.selectJobs()
        :param delay: when set, passed as the "older" argument of JobDB.selectJobs()
        :return: S_OK(), or the failed result of the job selection or of the
                 oversized sandbox removal scheduling
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    # Enforce the limit locally as well, whatever the selection returned
    jobList = result['Value'][:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      # Best effort: the jobs are removed anyway
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    # Keep the jobs whose sandbox removal could not be scheduled
    failedJobs = result['Value']['Failed']
    jobList = [ jobID for jobID in jobList if jobID not in failedJobs ]

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        # All three removals are attempted before any outcome is checked
        outcomes = ( ( 'JobDB', self.jobDB.removeJobFromDB( jobID ) ),
                     ( 'TaskQueueDB', self.taskQueueDB.deleteJob( jobID ) ),
                     ( 'JobLoggingDB', self.jobLoggingDB.deleteJob( jobID ) ) )
        errorFlag = False
        for dbName, dbResult in outcomes:
          if not dbResult['OK']:
            # Report the message of the call that failed, not of an earlier result
            gLogger.warn( 'Failed to remove job %d from %s' % ( jobID, dbName ), dbResult['Message'] )
            errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Schedule the removal of the oversized output sandboxes of the given jobs.

        :param list jobIDList: IDs of the jobs to treat
        :return: S_OK( { 'Successful' : { jobID : lfn or note }, 'Failed' : { jobID : lfn } } ).
                 Jobs whose 'OutputSandboxLFN' parameter could not be read are
                 only logged and appear in neither dictionary.
    """
    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if not result['OK']:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
        continue
      lfn = result['Value']
      if lfn:
        lfnDict[lfn] = jobID
      else:
        successful[jobID] = 'No oversized sandbox found'
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, on behalf of the job owner
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK'] or not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if result['OK']:
        successful[jobID] = lfn
      else:
        failed[jobID] = lfn

    return S_OK( {'Successful':successful, 'Failed':failed} )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Submit a request with a single RemoveFile operation for the given LFN.

        :param str lfn: LFN of the file to remove
        :param str ownerDN: value set as the request OwnerDN
        :param str ownerGroup: value set as the request OwnerGroup
        :return: result of ReqClient().putRequest()
    """
    removedFile = File()
    removedFile.LFN = lfn

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'
    removeFile.addFile( removedFile )

    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Пример #4
0
class JobCleaningAgent(AgentModule):
    """Agent removing old jobs from the JobDB and the TaskQueueDB.

    On each cycle the agent removes the jobs in "Deleted" status and then,
    for the job types that are not listed as production types, the jobs
    selected with each status of REMOVE_STATUS_DELAY and a cut-off time of
    "now minus the corresponding number of days".

    The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used
               for the agent restart
    """

    #############################################################################
    def initialize(self):
        """Create the DB clients and read the agent options.

        :return: S_OK()
        """
        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        # str.join() is used because string.join() only exists in Python 2
        gLogger.info("Will exclude the following Production types from cleaning %s" % ", ".join(self.prod_types))
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()

    def __getAllowedJobTypes(self):
        """Get the job types that are allowed to be cleaned.

        :return: S_OK(list) with the distinct JobType values of the JobDB that
                 are not in self.prod_types, or the failed JobDB result
        """
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = [jobType for jobType in result["Value"] if jobType not in self.prod_types]
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """Run one cleaning cycle.

        :return: S_OK(), or the failed result of the "Deleted" jobs removal or
                 of the job types lookup. Failures for the individual
                 REMOVE_STATUS_DELAY statuses are only logged.
        """
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({"Status": "Deleted"})
        if not result["OK"]:
            return result

        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result["OK"]:
            return result
        cleanJobTypes = result["Value"]
        if not cleanJobTypes:
            # No job type is eligible for cleaning: nothing to select
            return S_OK()

        # Remove jobs with final status
        for status, delay in REMOVE_STATUS_DELAY.items():
            condDict = {"JobType": cleanJobTypes, "Status": status}
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result["OK"]:
                gLogger.warn("Failed to remove jobs in status %s" % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """Remove the jobs matching the given conditions from the WMS databases.

        At most self.maxJobsAtOnce of the selected jobs are treated per call.

        :param dict condDict: selection conditions passed to JobDB.selectJobs()
        :param delay: when set, passed as the "older" argument of JobDB.selectJobs()
        :return: S_OK(), or the failed result of the job selection
        """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict)
        if not result["OK"]:
            return result

        # The selection is not limited, so cap the number of jobs here
        jobList = result["Value"][: self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            # Best effort: the jobs are removed anyway
            gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"])

        if self.jobByJob:
            for jobID in jobList:
                # Both removals are attempted before any outcome is checked
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                # Report the message of the call that failed, not of an earlier result
                if not resultJobDB["OK"]:
                    gLogger.warn("Failed to remove job %d from JobDB" % jobID, resultJobDB["Message"])
                    error_count += 1
                elif not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result["OK"]:
                gLogger.error("Failed to delete %d jobs from JobDB" % len(jobList))
            else:
                gLogger.info("Deleted %d jobs from JobDB" % len(jobList))

            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1

        if count > 0 or error_count > 0:
            gLogger.info("Deleted %d jobs from JobDB, %d errors" % (count, error_count))
        return S_OK()
Пример #5
0
class JobCleaningAgent(AgentModule):
    """
    Agent for removing jobs in status "Deleted", and not only.

    At each cycle the agent:

    * fully removes the jobs that are already in status JobStatus.DELETED,
      unless they still have requests that are not in a final state;
    * sets the status to JobStatus.DELETED for the jobs of the allowed job
      types matching the configured status/delay conditions;
    * optionally removes old HeartBeatLoggingInfo records.
    """

    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        # clients
        self.jobDB = None

        # maximum number of jobs selected by each query of _getJobsList
        self.maxJobsAtOnce = 500
        # job types that are excluded from the cleaning
        self.prodTypes = []
        # status -> delay (in days) before the jobs are set to DELETED
        self.removeStatusDelay = {}
        # status -> delay (in days) before the HeartBeatLoggingInfo is removed
        self.removeStatusDelayHB = {}
        # declared here so that the attribute exists even before initialize()
        self.maxHBJobsAtOnce = 0

    #############################################################################
    def initialize(self):
        """Sets defaults"""

        self.jobDB = JobDB()

        agentTSTypes = self.am_getOption("ProductionTypes", [])
        if agentTSTypes:
            self.prodTypes = agentTSTypes
        else:
            self.prodTypes = Operations().getValue(
                "Transformations/DataProcessing", ["MCSimulation", "Merge"])
        self.log.info(
            "Will exclude the following Production types from cleaning %s" %
            (", ".join(self.prodTypes)))
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce",
                                               self.maxJobsAtOnce)

        finalStatuses = (
            (JobStatus.DONE, "Done"),
            (JobStatus.KILLED, "Killed"),
            (JobStatus.FAILED, "Failed"),
        )
        for status, optionName in finalStatuses:
            self.removeStatusDelay[status] = self.am_getOption(
                "RemoveStatusDelay/%s" % optionName, 7)
        self.removeStatusDelay["Any"] = self.am_getOption(
            "RemoveStatusDelay/Any", -1)

        for status, optionName in finalStatuses:
            self.removeStatusDelayHB[status] = self.am_getOption(
                "RemoveStatusDelayHB/%s" % optionName, -1)
        self.maxHBJobsAtOnce = self.am_getOption("MaxHBJobsAtOnce", 0)

        return S_OK()

    def _getAllowedJobTypes(self):
        """Get valid jobTypes, i.e. those known to the JobDB that are not in self.prodTypes

        :returns: S_OK with the list of job types / S_ERROR
        """
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = [
            jobType for jobType in result["Value"]
            if jobType not in self.prodTypes
        ]
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    def execute(self):
        """Remove or delete jobs in various status"""

        # First, fully remove jobs in JobStatus.DELETED state.
        # A failure here is only logged: the rest of the cycle still runs.
        result = self.removeDeletedJobs()
        if not result["OK"]:
            self.log.error("Failed to remove jobs with status %s" %
                           JobStatus.DELETED)

        # Second: set the status to JobStatus.DELETED for certain jobs

        # Get all the Job types for which we can set the status to JobStatus.DELETED
        result = self._getAllowedJobTypes()
        if not result["OK"]:
            return result

        # No jobs in the system subject to deletion
        if not result["Value"]:
            return S_OK()

        baseCond = {"JobType": result["Value"]}
        # Delete jobs with final status
        for status, delay in self.removeStatusDelay.items():
            if delay < 0:
                # Negative delay means don't delete anything...
                continue
            condDict = dict(baseCond)
            if status != "Any":
                condDict["Status"] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.deleteJobsByStatus(condDict, delTime)
            if not result["OK"]:
                self.log.error("Failed to delete jobs",
                               "with condDict %s" % condDict)

        if self.maxHBJobsAtOnce > 0:
            for status, delay in self.removeStatusDelayHB.items():
                if delay > 0:
                    self.removeHeartBeatLoggingInfo(status, delay)

        return S_OK()

    def removeDeletedJobs(self):
        """Fully remove jobs that are already in status "DELETED", unless there are still requests.

        :returns: S_OK/S_ERROR
        """

        res = self._getJobsList({"Status": JobStatus.DELETED})
        if not res["OK"]:
            return res
        jobList = res["Value"]
        if not jobList:
            self.log.info("No jobs to remove")
            return S_OK()

        self.log.info("Unassigning sandboxes from soon to be deleted jobs",
                      "(%d)" % len(jobList))
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            self.log.error("Cannot unassign jobs to sandboxes",
                           result["Message"])
            return result

        self.log.info("Attempting to remove deleted jobs",
                      "(%d)" % len(jobList))

        # remove from jobList those that have still Operations to do in RMS
        reqClient = ReqClient()
        res = reqClient.getRequestIDsForJobs(jobList)
        if not res["OK"]:
            return res
        if res["Value"]["Successful"]:
            notFinal = set()
            # Check whether these requests are in a final status
            for job, reqID in res["Value"]["Successful"].items():
                # If not, remove job from list to remove
                if reqClient.getRequestStatus(reqID).get(
                        "Value") not in Request.FINAL_STATES:
                    # Keep that job
                    notFinal.add(job)
                else:
                    # Remove the request, if failed, keep the job
                    res1 = reqClient.deleteRequest(reqID)
                    if not res1["OK"]:
                        notFinal.add(job)
            if notFinal:
                self.log.info(
                    "Some jobs won't be removed, as still having Requests not in final status",
                    "(n=%d)" % len(notFinal))
                jobList = list(set(jobList) - notFinal)
        if not jobList:
            return S_OK()

        return self._actOnJobsByOwner(jobList, "removeJob", "remove")

    def deleteJobsByStatus(self, condDict, delay=False):
        """Sets the job status to "DELETED" for jobs in condDict.

        :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
        :param delay: passed as the ``older`` selection argument of the job
                      query; execute() passes a timestamp string. A false
                      value is passed through unchanged.
        :returns: S_OK/S_ERROR
        """

        res = self._getJobsList(condDict, delay)
        if not res["OK"]:
            return res
        jobList = res["Value"]
        if not jobList:
            return S_OK()

        self.log.notice("Attempting to delete jobs",
                        "(%d for %s)" % (len(jobList), condDict))

        result = self.deleteJobOversizedSandbox(
            jobList)  # This might set a request
        if not result["OK"]:
            self.log.error("Cannot schedule removal of oversized sandboxes",
                           result["Message"])
            return result

        # deleteJobOversizedSandbox reports under the literal "Failed" key;
        # the jobs listed there are kept for a later cycle.
        failedJobs = set(result["Value"]["Failed"])
        if failedJobs:
            jobList = [job for job in jobList if job not in failedJobs]
        if not jobList:
            return S_OK()

        return self._actOnJobsByOwner(jobList, "deleteJob", "delete")

    def _actOnJobsByOwner(self, jobList, wmsMethodName, verb):
        """Call a WMSClient method on the jobs, owner by owner, with the owner's delegated credentials.

        :param list jobList: list of int(JobID)
        :param str wmsMethodName: name of the WMSClient method to call ("removeJob" or "deleteJob")
        :param str verb: verb used in the log messages ("remove" or "delete")
        :returns: S_OK if every call succeeded, S_ERROR otherwise
        """
        ownerJobsDict = self._getOwnerJobsDict(jobList)
        # On failure _getOwnerJobsDict hands back the S_ERROR structure itself.
        # A genuine grouping only has "dn;group" keys, so it never has "OK".
        if ownerJobsDict.get("OK") is False:
            return ownerJobsDict

        fail = False
        for owner, jobsList in ownerJobsDict.items():
            ownerParts = owner.split(";")
            ownerDN = ownerParts[0]
            ownerGroup = ownerParts[1]
            self.log.verbose(
                "Attempting to %s jobs" % verb,
                "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
            wmsClient = WMSClient(useCertificates=True,
                                  delegatedDN=ownerDN,
                                  delegatedGroup=ownerGroup)
            result = getattr(wmsClient, wmsMethodName)(jobsList)
            if not result["OK"]:
                self.log.error(
                    "Could not %s jobs" % verb,
                    "for %s : %s (n=%d) : %s" %
                    (ownerDN, ownerGroup, len(jobsList), result["Message"]),
                )
                # keep going: the other owners are still processed
                fail = True

        if fail:
            return S_ERROR("Could not %s some jobs" % verb)

        return S_OK()

    def _getJobsList(self, condDict, delay=None):
        """Get jobs list according to conditions

        The selection is run twice, by ascending and by descending JobID, each
        time limited to self.maxJobsAtOnce, and the two results are merged.

        :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
        :param delay: passed as the ``older`` selection argument of the job query
        :returns: S_OK with jobsList
        """
        jobIDsS = set()
        delayStr = "and older than %s" % delay if delay else ""
        self.log.info("Get jobs with %s %s" % (str(condDict), delayStr))
        for order in ["JobID:ASC", "JobID:DESC"]:
            result = self.jobDB.selectJobs(condDict,
                                           older=delay,
                                           orderAttribute=order,
                                           limit=self.maxJobsAtOnce)
            if not result["OK"]:
                return result
            jobIDsS.update(int(jID) for jID in result["Value"])

        return S_OK(list(jobIDsS))

    def _getOwnerJobsDict(self, jobList):
        """
        :param list jobList: list of int(JobID)

        :returns: a dict with a grouping of them by owner, e.g.{'dn;group': [1, 3, 4], 'dn;group_1': [5], 'dn_1;group': [2]}
                  If the attributes cannot be retrieved, the S_ERROR structure
                  of the failed call is returned instead.
        """
        res = self.jobDB.getJobsAttributes(jobList, ["OwnerDN", "OwnerGroup"])
        if not res["OK"]:
            self.log.error("Could not get the jobs attributes", res["Message"])
            return res
        jobsDictAttribs = res["Value"]

        ownerJobsDict = {}
        for jobID, jobDict in jobsDictAttribs.items():
            # Name the attributes explicitly: relying on the order of
            # jobDict.values() could swap the DN and the group.
            owner = ";".join((jobDict["OwnerDN"], jobDict["OwnerGroup"]))
            ownerJobsDict.setdefault(owner, []).append(jobID)
        return ownerJobsDict

    def deleteJobOversizedSandbox(self, jobIDList):
        """
        Deletes the job oversized sandbox files from storage elements,
        by setting a removal request in RMS for each of them.

        :param list jobIDList: list of job IDs
        :returns: S_OK with a dict {"Successful": {jobID: lfn}, "Failed": {jobID: lfn}} / S_ERROR
        """

        failed = {}
        successful = {}

        result = JobMonitoringClient().getJobParameters(
            jobIDList, ["OutputSandboxLFN"])
        if not result["OK"]:
            return result
        osLFNDict = result["Value"]
        if not osLFNDict:
            return S_OK({"Successful": successful, "Failed": failed})
        # drop the jobs for which no parameter came back
        osLFNDict = {
            jobID: params
            for jobID, params in osLFNDict.items() if params
        }

        self.log.verbose("Deleting oversized sandboxes", osLFNDict)
        # Schedule removal of the LFNs now
        for jobID, outputSandboxLFNdict in osLFNDict.items():
            lfn = outputSandboxLFNdict["OutputSandboxLFN"]
            result = self.jobDB.getJobAttributes(jobID,
                                                 ["OwnerDN", "OwnerGroup"])
            # without the owner credentials the request cannot be set
            if not result["OK"] or not result["Value"]:
                failed[jobID] = lfn
                continue

            ownerDN = result["Value"]["OwnerDN"]
            ownerGroup = result["Value"]["OwnerGroup"]
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result["OK"]:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        return S_OK({"Successful": successful, "Failed": failed})

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """Set removal request with the given credentials

        :param str lfn: LFN of the file to remove
        :param str ownerDN: DN set as owner of the request
        :param str ownerGroup: group set as owner group of the request
        :returns: the result of ReqClient.putRequest
        """
        oRequest = Request()
        oRequest.OwnerDN = ownerDN
        oRequest.OwnerGroup = ownerGroup
        oRequest.RequestName = os.path.basename(
            lfn).strip() + "_removal_request.xml"
        oRequest.SourceComponent = "JobCleaningAgent"

        removeFile = Operation()
        removeFile.Type = "RemoveFile"

        removedFile = File()
        removedFile.LFN = lfn

        removeFile.addFile(removedFile)
        oRequest.addOperation(removeFile)

        # put the request with the owner certificate to make sure it's still a valid DN
        return ReqClient(useCertificates=True,
                         delegatedDN=ownerDN,
                         delegatedGroup=ownerGroup).putRequest(oRequest)

    def removeHeartBeatLoggingInfo(self, status, delayDays):
        """Remove HeartBeatLoggingInfo for jobs with given status after given number of days.

        A failure is only logged.

        :param str status: Job Status
        :param int delayDays: number of days after which information is removed
        :returns: None
        """
        self.log.info(
            "Removing HeartBeatLoggingInfo for Jobs with %s and older than %s day(s)"
            % (status, delayDays))
        delTime = str(Time.dateTime() - delayDays * Time.day)
        result = self.jobDB.removeInfoFromHeartBeatLogging(
            status, delTime, self.maxHBJobsAtOnce)
        if not result["OK"]:
            self.log.error("Failed to delete from HeartBeatLoggingInfo",
                           result["Message"])
        else:
            self.log.info("Deleted HeartBeatLogging info")
        return
Пример #6
0
class TaskAgent( AgentModule ):
  """
      Agent refreshing the tasks of the TaskDB from the jobs of the JobDB.

      At each cycle, for every task in status Ready, Processing or Finished,
      it refreshes the 'Site' and 'JobGroup' attributes, the progress counters
      and the status of the task.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def initialize( self ):
    """ Create the database clients used by the agent
    """
    self.__taskDB = TaskDB()
    self.__jobDB = JobDB()
    return S_OK()

  def execute( self ):
    """ Main execution method: refresh all the tasks that are not yet expired
    """
    refreshableStatuses = ['Ready', 'Processing', 'Finished']
    condDict = { 'Status': refreshableStatuses }
    result = self.__taskDB.getTasks( [ 'TaskID', 'Status' ], condDict )
    if not result['OK']:
      return result

    tasks = result['Value']

    self.log.info( '%d tasks will be refreshed' % len(tasks) )

    for task in tasks:
      # each row follows the requested field order: TaskID, Status
      taskID = task[0]
      status = task[1]

      if status in refreshableStatuses:
        self.__refreshTask( taskID )

    return S_OK()


  def __refreshTask( self, taskID ):
    """ Refresh the site, the job group and the status of one task.

        Failures are logged and do not prevent the following steps.
    """
    result = self.__refreshTaskStringAttribute( taskID, 'Site' )
    if result['OK']:
      self.log.debug( 'Task %d site is refreshed' % taskID )
    else:
      self.log.error( 'Task %d site refresh failed: %s' % ( taskID, result['Message'] ) )

    result = self.__refreshTaskStringAttribute( taskID, 'JobGroup' )
    if result['OK']:
      self.log.debug( 'Task %d job group is refreshed' % taskID )
    else:
      self.log.error( 'Task %d job group refresh failed: %s' % ( taskID, result['Message'] ) )

    result = self.__refreshTaskStatus( taskID )
    if result['OK']:
      self.log.debug( 'Task %d status is refreshed' % taskID )
    else:
      self.log.error( 'Task %d status refresh failed: %s' % ( taskID, result['Message'] ) )


  ##############################################################################

  def __getTaskProgress( self, taskID ):
    """ Count the jobs of the task by category.

        Jobs of the task for which the JobDB returns no status are counted as
        'Deleted'.

        :returns: S_OK with a dict of counters
                  (Total, Done, Failed, Running, Waiting, Deleted) / S_ERROR
    """
    result = self.__taskDB.getTaskJobs( taskID )
    if not result['OK']:
      return result
    jobIDs = result['Value']

    result = self.__jobDB.getAttributesForJobList( jobIDs, ['Status'] )
    if not result['OK']:
      return result
    statuses = result['Value']

    progress = { 'Total': len(jobIDs), 'Done': 0, 'Failed': 0, 'Running': 0, 'Waiting': 0, 'Deleted': 0 }
    for jobID in jobIDs:
      if jobID not in statuses:
        progress['Deleted'] += 1
        continue

      status = statuses[jobID]['Status']
      if status == 'Done':
        progress['Done'] += 1
      elif status in ['Failed', 'Stalled', 'Killed']:
        progress['Failed'] += 1
      elif status in ['Running', 'Completed']:
        progress['Running'] += 1
      else:
        # any other status is considered as not yet started
        progress['Waiting'] += 1

    return S_OK( progress )

  def __analyseTaskStatus( self, progress ):
    """ Deduce the task status from the progress counters.

        :returns: 'Expired' if all the jobs are deleted (this includes a task
                  without jobs), 'Finished' if no job is running or waiting,
                  'Processing' otherwise
    """
    totalJob = progress.get( 'Total', 0 )
    runningJob = progress.get( 'Running', 0 )
    waitingJob = progress.get( 'Waiting', 0 )
    deletedJob = progress.get( 'Deleted', 0 )

    if deletedJob == totalJob:
      return 'Expired'
    if runningJob == 0 and waitingJob == 0:
      return 'Finished'
    return 'Processing'

  def __refreshTaskStatus( self, taskID ):
    """ Refresh the task status

        :returns: S_OK with the new status / S_ERROR
    """
    # get task progress from the job list
    result = self.__getTaskProgress( taskID )
    if not result['OK']:
      return result
    progress = result['Value']
    self.log.debug( 'Task %d Progress: %s' % ( taskID, progress ) )
    result = self.__taskDB.updateTaskProgress( taskID, progress )
    if not result['OK']:
      return result

    # get previous task status
    result = self.__taskDB.getTaskStatus( taskID )
    if not result['OK']:
      return result
    status = result['Value']

    # get current task status from the progress
    newStatus = self.__analyseTaskStatus( progress )
    self.log.debug( 'Task %d new status: %s' % ( taskID, newStatus ) )
    if newStatus != status:
      # the result of the update itself must be checked, not the one of the
      # previous query
      result = self.__taskDB.updateTaskStatus( taskID, newStatus, 'Status refreshed' )
      if not result['OK']:
        return result

    return S_OK( newStatus )


  ##############################################################################

  def __getTaskAttribute( self, taskID, attributeType ):
    """ Get all attributes of the jobs in the task

        :returns: S_OK with the distinct values of attributeType among the
                  jobs of the task / S_ERROR
    """
    result = self.__taskDB.getTaskJobs( taskID )
    if not result['OK']:
      return result
    jobIDs = result['Value']

    condDict = { 'JobID': jobIDs }

    result = self.__jobDB.getDistinctJobAttributes( attributeType, condDict )
    if not result['OK']:
      return result

    return S_OK( result['Value'] )

  def __refreshTaskStringAttribute( self, taskID, attributeType ):
    """ Refresh the task attribute. The attribute type must be string and separated by comma

        The values already stored for the task are merged with the ones found
        in its jobs; the placeholders '', 'ANY' and 'Multiple' are dropped.

        :returns: S_OK with the attribute values / S_ERROR
    """
    # get task attributes from the job list
    result = self.__getTaskAttribute( taskID, attributeType )
    if not result['OK']:
      return result
    newAttributes = result['Value']

    # get previous task attributes
    result = self.__taskDB.getTask( taskID, [attributeType] )
    if not result['OK']:
      return result
    oldAttributes = result['Value'][0].split( ',' )

    # check whether there are differences
    if set( newAttributes ) == set( oldAttributes ):
      self.log.debug( 'Task %s attribute is the same: %s' % (attributeType, oldAttributes) )
      return S_OK( oldAttributes )

    # make a combination of old and new attributes, without the placeholders
    attributes = [ attribute for attribute in set( oldAttributes ) | set( newAttributes )
                   if attribute not in ( '', 'ANY', 'Multiple' ) ]

    # generate a new attribute
    allAttributes = ','.join( attributes )
    result = self.__taskDB.updateTask( taskID, [attributeType], [allAttributes] )
    if not result['OK']:
      return result

    return S_OK( allAttributes )
Пример #7
0
class TaskAgent(AgentModule):
    """
    Agent keeping the TaskDB tasks in sync with their jobs in the JobDB.

    Every task whose status is Ready, Processing or Finished gets its 'Site'
    and 'JobGroup' attributes, its progress counters and its status
    recomputed at each cycle.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    def initialize(self):
        """Instantiate the TaskDB and JobDB clients."""
        self.__taskDB = TaskDB()
        self.__jobDB = JobDB()
        return S_OK()

    def execute(self):
        """Main execution method: refresh every task that is not expired."""
        activeStatuses = ['Ready', 'Processing', 'Finished']
        condDict = {'Status': activeStatuses}
        result = self.__taskDB.getTasks(['TaskID', 'Status'], condDict)
        if not result['OK']:
            return result

        tasks = result['Value']

        self.log.info('%d tasks will be refreshed' % len(tasks))

        for task in tasks:
            # rows come back in the requested field order: TaskID, Status
            taskID = task[0]
            status = task[1]

            if status in activeStatuses:
                self.__refreshTask(taskID)

        return S_OK()

    def __refreshTask(self, taskID):
        """Refresh site, job group and status of a single task.

        Each step is independent: a failure is logged and the next step is
        still attempted.
        """
        result = self.__refreshTaskStringAttribute(taskID, 'Site')
        if result['OK']:
            self.log.debug('Task %d site is refreshed' % taskID)
        else:
            self.log.error('Task %d site refresh failed: %s' %
                           (taskID, result['Message']))

        result = self.__refreshTaskStringAttribute(taskID, 'JobGroup')
        if result['OK']:
            self.log.debug('Task %d job group is refreshed' % taskID)
        else:
            self.log.error('Task %d job group refresh failed: %s' %
                           (taskID, result['Message']))

        result = self.__refreshTaskStatus(taskID)
        if result['OK']:
            self.log.debug('Task %d status is refreshed' % taskID)
        else:
            self.log.error('Task %d status refresh failed: %s' %
                           (taskID, result['Message']))

    ############################################################################

    def __getTaskProgress(self, taskID):
        """Count the jobs of the task per category.

        A job of the task for which the JobDB returns no status is counted as
        'Deleted'.

        :returns: S_OK with a dict holding the counters Total, Done, Failed,
                  Running, Waiting and Deleted / S_ERROR
        """
        result = self.__taskDB.getTaskJobs(taskID)
        if not result['OK']:
            return result
        jobIDs = result['Value']

        result = self.__jobDB.getAttributesForJobList(jobIDs, ['Status'])
        if not result['OK']:
            return result
        statuses = result['Value']

        progress = {
            'Total': len(jobIDs),
            'Done': 0,
            'Failed': 0,
            'Running': 0,
            'Waiting': 0,
            'Deleted': 0
        }
        for jobID in jobIDs:
            if jobID not in statuses:
                progress['Deleted'] += 1
                continue

            status = statuses[jobID]['Status']
            if status == 'Done':
                progress['Done'] += 1
            elif status in ['Failed', 'Stalled', 'Killed']:
                progress['Failed'] += 1
            elif status in ['Running', 'Completed']:
                progress['Running'] += 1
            else:
                # every other status counts as waiting
                progress['Waiting'] += 1

        return S_OK(progress)

    def __analyseTaskStatus(self, progress):
        """Derive the task status from its progress counters.

        :returns: 'Expired' when all the jobs are deleted (also the case of a
                  task with no job at all), 'Finished' when nothing is running
                  or waiting, 'Processing' otherwise
        """
        totalJob = progress.get('Total', 0)
        runningJob = progress.get('Running', 0)
        waitingJob = progress.get('Waiting', 0)
        deletedJob = progress.get('Deleted', 0)

        if deletedJob == totalJob:
            return 'Expired'
        if runningJob == 0 and waitingJob == 0:
            return 'Finished'
        return 'Processing'

    def __refreshTaskStatus(self, taskID):
        """Refresh the task status.

        :returns: S_OK with the new status / S_ERROR
        """
        # get task progress from the job list
        result = self.__getTaskProgress(taskID)
        if not result['OK']:
            return result
        progress = result['Value']
        self.log.debug('Task %d Progress: %s' % (taskID, progress))
        result = self.__taskDB.updateTaskProgress(taskID, progress)
        if not result['OK']:
            return result

        # get previous task status
        result = self.__taskDB.getTaskStatus(taskID)
        if not result['OK']:
            return result
        status = result['Value']

        # get current task status from the progress
        newStatus = self.__analyseTaskStatus(progress)
        self.log.debug('Task %d new status: %s' % (taskID, newStatus))
        if newStatus != status:
            # keep the outcome of the update: testing the result of the
            # previous query would hide a failed update
            result = self.__taskDB.updateTaskStatus(taskID, newStatus,
                                                    'Status refreshed')
            if not result['OK']:
                return result

        return S_OK(newStatus)

    ############################################################################

    def __getTaskAttribute(self, taskID, attributeType):
        """Get all attributes of the jobs in the task.

        :returns: S_OK with the distinct values of attributeType found among
                  the jobs of the task / S_ERROR
        """
        result = self.__taskDB.getTaskJobs(taskID)
        if not result['OK']:
            return result
        jobIDs = result['Value']

        condDict = {'JobID': jobIDs}

        result = self.__jobDB.getDistinctJobAttributes(attributeType, condDict)
        if not result['OK']:
            return result

        return S_OK(result['Value'])

    def __refreshTaskStringAttribute(self, taskID, attributeType):
        """Refresh the task attribute. The attribute type must be string and separated by comma.

        The values stored for the task are merged with those found in its
        jobs, and the placeholders '', 'ANY' and 'Multiple' are discarded.

        :returns: S_OK with the attribute values / S_ERROR
        """
        # get task attributes from the job list
        result = self.__getTaskAttribute(taskID, attributeType)
        if not result['OK']:
            return result
        newAttributes = result['Value']

        # get previous task attributes
        result = self.__taskDB.getTask(taskID, [attributeType])
        if not result['OK']:
            return result
        oldAttributes = result['Value'][0].split(',')

        # check whether there are differences
        if set(newAttributes) == set(oldAttributes):
            self.log.debug('Task %s attribute is the same: %s' %
                           (attributeType, oldAttributes))
            return S_OK(oldAttributes)

        # make a combination of old and new attributes, minus the placeholders
        attributes = [
            attribute for attribute in set(oldAttributes) | set(newAttributes)
            if attribute not in ('', 'ANY', 'Multiple')
        ]

        # generate a new attribute
        allAttributes = ','.join(attributes)
        result = self.__taskDB.updateTask(taskID, [attributeType],
                                          [allAttributes])
        if not result['OK']:
            return result

        return S_OK(allAttributes)
Пример #8
0
class JobCleaningAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self):
        """Sets defaults: polling time, database clients and agent options.

        :returns: S_OK
        """

        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )

        # Job types that must not be cleaned: the agent option wins over the
        # Operations value
        agentTSTypes = self.am_getOption('ProductionTypes', [])
        if agentTSTypes:
            self.prod_types = agentTSTypes
        else:
            self.prod_types = Operations().getValue(
                'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        # str.join instead of string.join: the latter no longer exists in
        # Python 3
        gLogger.info(
            'Will exclude the following Production types from cleaning %s' %
            ', '.join(self.prod_types))
        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
        self.jobByJob = self.am_getOption('JobByJob', True)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
        return S_OK()

    def __getAllowedJobTypes(self):
        """Get the job types known to the JobDB that are not in self.prod_types.

        :returns: S_OK with the list of job types, or the failed result of
                  the JobDB query
        """
        lookup = self.jobDB.getDistinctJobAttributes('JobType')
        if lookup['OK']:
            cleanable = [
                known for known in lookup['Value']
                if known not in self.prod_types
            ]
            self.log.notice("JobTypes to clean %s" % cleanable)
            lookup = S_OK(cleanable)
        return lookup

    #############################################################################
    def execute(self):
        """Main execution method of the agent.

        Removes the jobs in status 'Deleted', then the jobs of the allowed
        job types that have been in one of the REMOVE_STATUS_DELAY statuses
        for longer than the associated number of days.

        :returns: S_OK, or S_ERROR if the 'Deleted' jobs cannot be removed or
                  the allowed job types cannot be retrieved
        """
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({'Status': 'Deleted'})
        if not result['OK']:
            return result
        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result['OK']:
            return result
        # No job type can be cleaned: nothing is eligible. Returning here
        # also avoids selecting jobs with an empty JobType condition.
        if not result['Value']:
            return S_OK()
        baseCond = {'JobType': result['Value']}
        # Remove jobs with final status
        for status in REMOVE_STATUS_DELAY:
            delay = REMOVE_STATUS_DELAY[status]
            condDict = dict(baseCond)
            condDict['Status'] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result['OK']:
                # best effort: the other statuses are still processed
                gLogger.warn('Failed to remove jobs in status %s' % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """Remove the jobs matching condDict from the JobDB, TaskQueueDB and JobLoggingDB.

        At most self.maxJobsAtOnce jobs are handled per call. The jobs are
        first unassigned from their sandboxes, then the removal of their
        oversized sandboxes is scheduled; the jobs for which that scheduling
        fails are left untouched.

        :param dict condDict: selection conditions, e.g. {'Status': 'Deleted'}
        :param delay: if true, passed as the ``older`` argument of the job
                      selection
        :returns: S_OK, or S_ERROR if the jobs cannot be selected or the
                  oversized sandboxes removal cannot be scheduled
        """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" %
                            (condDict, delay))
            result = self.jobDB.selectJobs(condDict,
                                           older=delay,
                                           limit=self.maxJobsAtOnce)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)

        if not result['OK']:
            return result

        jobList = result['Value']
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[:self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result['OK']:
            # best effort: the removal goes on anyway
            gLogger.warn("Cannot unassign jobs to sandboxes",
                         result['Message'])

        result = self.deleteJobOversizedSandbox(jobList)
        if not result['OK']:
            gLogger.warn("Cannot schedle removal of oversized sandboxes",
                         result['Message'])
            return result

        # Keep the jobs whose oversized sandbox removal could not be scheduled
        failedJobs = set(result['Value']['Failed'])
        if failedJobs:
            jobList = [job for job in jobList if job not in failedJobs]
        if not jobList:
            return S_OK()

        if self.jobByJob:
            for jobID in jobList:
                outcomes = (
                    ('JobDB', self.jobDB.removeJobFromDB(jobID)),
                    ('TaskQueueDB', self.taskQueueDB.deleteJob(jobID)),
                    ('JobLoggingDB', self.jobLoggingDB.deleteJob(jobID)),
                )
                errorFlag = False
                for dbName, dbResult in outcomes:
                    if not dbResult['OK']:
                        # report the message of the call that failed; the
                        # sandbox result above is successful at this point
                        # and carries no message
                        gLogger.warn(
                            'Failed to remove job %d from %s' %
                            (jobID, dbName), dbResult['Message'])
                        errorFlag = True
                if errorFlag:
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobDB' %
                              len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

            # in this mode the counters only reflect the TaskQueueDB outcome
            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ['OK']:
                    gLogger.warn(
                        'Failed to remove job %d from TaskQueueDB' % jobID,
                        resultTQ['Message'])
                    error_count += 1
                else:
                    count += 1

            result = self.jobLoggingDB.deleteJob(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobLoggingDB' %
                              len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobLoggingDB' %
                             len(jobList))

        if count > 0 or error_count > 0:
            gLogger.info('Deleted %d jobs from JobDB, %d errors' %
                         (count, error_count))
        return S_OK()

    def deleteJobOversizedSandbox(self, jobIDList):
        """ Schedule the removal of the oversized output sandboxes of jobs

        For every job the 'OutputSandboxLFN' job parameter is read from the
        JobDB. When it is set, a removal request for that LFN is created
        with the OwnerDN / OwnerGroup of the job.

        :param jobIDList: IDs of the jobs to look at
        :return: S_OK( { 'Successful' : {...}, 'Failed' : {...} } ), both
                 dictionaries being keyed by job ID. 'Successful' maps to the
                 LFN whose removal was scheduled, or to a note when the job
                 has no oversized sandbox; 'Failed' maps to the LFN whose
                 removal could not be scheduled.
        """
        failed = {}
        successful = {}

        lfnDict = {}
        for jobID in jobIDList:
            result = self.jobDB.getJobParameter(jobID, 'OutputSandboxLFN')
            if not result['OK']:
                # Only logged: such a job ends up in neither dictionary
                gLogger.warn('Error interrogating JobDB: %s' %
                             result['Message'])
                continue
            lfn = result['Value']
            if lfn:
                lfnDict[lfn] = jobID
            else:
                successful[jobID] = 'No oversized sandbox found'
        if not lfnDict:
            return S_OK({'Successful': successful, 'Failed': failed})

        # Schedule removal of the LFNs now

        for lfn, jobID in lfnDict.items():
            result = self.jobDB.getJobAttributes(jobID,
                                                 ['OwnerDN', 'OwnerGroup'])
            # A failed query and an empty answer are both treated as a failure
            if not result['OK'] or not result['Value']:
                failed[jobID] = lfn
                continue

            ownerDN = result['Value']['OwnerDN']
            ownerGroup = result['Value']['OwnerGroup']
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result['OK']:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        return S_OK({'Successful': successful, 'Failed': failed})

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """ Build a 'removeFile' request for one LFN and submit it

        :param lfn: LFN of the file to remove
        :param ownerDN: value set as the request 'OwnerDN' attribute
        :param ownerGroup: value set as the request 'OwnerGroup' attribute
        :return: the result of RequestClient.setRequest, or the failed result
                 of the sub-request creation / XML serialisation
        """
        requestName = os.path.basename(lfn).strip() + '_removal_request.xml'

        removalRequest = RequestContainer()
        ownerAttributes = {'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup}
        removalRequest.setRequestAttributes(ownerAttributes)
        removalRequest.setRequestName(requestName)
        removalRequest.setSourceComponent('JobCleaningAgent')

        # A single 'removal' sub-request carrying the file to delete
        subRequestAttributes = {
            'Operation': 'removeFile',
            'TargetSE': '',
            'ExecutionOrder': 0
        }
        added = removalRequest.addSubRequest(
            {'Attributes': subRequestAttributes}, 'removal')
        if not added['OK']:
            return added

        waitingFile = {'LFN': lfn, 'PFN': '', 'Status': 'Waiting'}
        removalRequest.setSubRequestFiles(added['Value'], 'removal',
                                          [waitingFile])

        requestClient = RequestClient()
        serialised = removalRequest.toXML()
        if not serialised['OK']:
            return serialised
        return requestClient.setRequest(requestName, serialised['Value'])
Пример #9
0
class JobCleaningAgent( AgentModule ):
  """ Agent removing jobs from the JobDB, the TaskQueueDB and the JobLoggingDB

      On every cycle the agent removes the jobs in 'Deleted' status and then,
      for the job types not listed as production types, the jobs selected
      per status with the delays configured in 'RemoveStatusDelay'.

      The specific agents must provide the following methods:

         *  initialize() for initial settings
         *  beginExecution()
         *  execute() - the main method called in the agent cycle
         *  endExecution()
         *  finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor

        Only declares the attributes with their defaults; the DB objects and
        the configured option values are set in initialize().
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    # Maximum number of jobs handled by one removeJobsByStatus() call
    self.maxJobsAtOnce = 100
    # When True the jobs are removed one by one instead of in bulk
    self.jobByJob = False
    # Argument of time.sleep() after each job removed in job-by-job mode
    self.throttlingPeriod = 0.

    # Job types excluded from the cleaning
    self.prodTypes = []

    # status -> delay, multiplied by Time.day in execute(); negative means "never remove"
    self.removeStatusDelay = {}

  #############################################################################
  def initialize( self ):
    """ Sets defaults

        Creates the DB objects and reads the agent options 'ProductionTypes',
        'MaxJobsAtOnce', 'JobByJob', 'ThrottlingPeriod' and
        'RemoveStatusDelay/<Done|Killed|Failed|Any>'.

        :return: S_OK()
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prodTypes = agentTSTypes
    else:
      # Fall back on the Operations value when the agent option is not set
      self.prodTypes = Operations().getValue(
          'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (
        ', '.join(self.prodTypes)))
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes

        :return: S_OK( list of the job types found in the JobDB that are not
                 in self.prodTypes ), or the failed JobDB result
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = [ jobType for jobType in result[ 'Value' ] if jobType not in self.prodTypes ]
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status

        :return: S_OK(), or the failed result of the removal of the 'Deleted'
                 jobs / of the job type lookup. Failures of the per-status
                 removals are only logged.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status, delay in self.removeStatusDelay.items():
      if delay < 0:
        # Negative delay means don't delete anything...
        continue
      condDict = dict( baseCond )
      # 'Any' puts no condition on the status
      if status != 'Any':
        condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove the jobs matching the given conditions

        At most self.maxJobsAtOnce jobs are handled per call. The jobs are
        unassigned from their sandboxes, the removal of their oversized
        sandboxes is scheduled, and they are then deleted from the JobDB,
        the TaskQueueDB and the JobLoggingDB. Jobs for which the oversized
        sandbox removal could not be scheduled are left untouched.

        :param dict condDict: selection conditions passed to JobDB.selectJobs
        :param delay: when set, passed as the `older` argument of
                      JobDB.selectJobs (execute() passes a timestamp string)
        :return: S_OK(), or the failed result of the job selection, of the
                 sandbox unassignment or of the oversized sandbox handling.
                 Failures of the individual DB deletions are only logged.
    """
    if delay:
      # `delay` is a point in time, not a number of days
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
      return result

    result = self.deleteJobOversizedSandbox(jobList)
    if not result[ 'OK' ]:
      gLogger.error(
          "Cannot schedule removal of oversized sandboxes", result['Message'])
      return result

    # Keep the jobs whose oversized sandbox removal could not be scheduled
    failedJobs = result['Value']['Failed']
    jobList = [ jobID for jobID in jobList if jobID not in failedJobs ]

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        # All three deletions are attempted, whatever the outcome of the others
        removals = ( ( 'JobDB', self.jobDB.removeJobFromDB( jobID ) ),
                     ( 'TaskQueueDB', self.taskQueueDB.deleteJob( jobID ) ),
                     ( 'JobLoggingDB', self.jobLoggingDB.deleteJob( jobID ) ) )
        errorFlag = False
        for dbName, removal in removals:
          if not removal['OK']:
            # Report the message of the call that actually failed
            gLogger.warn( 'Failed to remove job %s from %s' % ( jobID, dbName ), removal['Message'] )
            errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )

      # In bulk mode the counters only follow the TaskQueueDB deletions
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %s from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements

        For every job the 'OutputSandboxLFN' job parameter is read from the
        JobDB. When it is set, a removal request for that LFN is created
        with the OwnerDN / OwnerGroup of the job.

        :param jobIDList: IDs of the jobs to look at
        :return: S_OK( { 'Successful' : {...}, 'Failed' : {...} } ), both
                 dictionaries being keyed by job ID
    """

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        # Only logged: such a job ends up in neither dictionary
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      # A failed query and an empty answer are both treated as a failure
      if not result['OK'] or not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK(result)

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials

        :param lfn: LFN of the file to remove
        :param ownerDN: value set as the request OwnerDN
        :param ownerGroup: value set as the request OwnerGroup
        :return: the result of ReqClient.putRequest
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    # One 'RemoveFile' operation holding the single file
    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Пример #10
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Sets defaults

        Creates the DB objects and reads the agent options 'ProductionTypes',
        'MaxJobsAtOnce', 'JobByJob' and 'ThrottlingPeriod'.

        :return: S_OK()
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      # Fall back on the Operations value when the agent option is not set
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join instead of the Python 2-only string.join(): same text, works on Python 2 and 3
    gLogger.info('Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the job types that can be cleaned

        :return: S_OK( list of the job types found in the JobDB that are not
                 in self.prod_types ), or the failed JobDB result
    """
    distinctTypes = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if distinctTypes[ 'OK' ]:
      # Keep everything that is not a production type
      cleanJobTypes = [ candidate for candidate in distinctTypes[ 'Value' ]
                        if candidate not in self.prod_types ]
      self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
      return S_OK( cleanJobTypes )
    return distinctTypes

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status

        Removes the jobs in 'Deleted' status, then, for the job types that
        can be cleaned, the jobs in each status of REMOVE_STATUS_DELAY that
        are older than the corresponding delay.

        :return: S_OK(), or the failed result of the removal of the 'Deleted'
                 jobs / of the job type lookup. Failures of the per-status
                 removals are only logged.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No job type subject to removal: do not select with an empty 'JobType' list
    # NOTE(review): presumably an empty list would not restrict the selection - confirm against JobDB.selectJobs
    if not result[ 'Value' ]:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status, delay in REMOVE_STATUS_DELAY.items():
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove the jobs matching the given conditions

        At most self.maxJobsAtOnce jobs are handled per call. The jobs are
        unassigned from their sandboxes (a failure there is only logged) and
        then deleted from the JobDB, the TaskQueueDB and the JobLoggingDB.

        :param dict condDict: selection conditions passed to JobDB.selectJobs
        :param delay: when set, passed as the `older` argument of
                      JobDB.selectJobs (execute() passes a timestamp string)
        :return: S_OK(), or the failed result of the job selection. Failures
                 of the individual DB deletions are only logged.
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        # All three deletions are attempted, whatever the outcome of the others
        removals = ( ( 'JobDB', self.jobDB.removeJobFromDB( jobID ) ),
                     ( 'TaskQueueDB', self.taskQueueDB.deleteJob( jobID ) ),
                     ( 'JobLoggingDB', self.jobLoggingDB.deleteJob( jobID ) ) )
        errorFlag = False
        for dbName, removal in removals:
          if not removal['OK']:
            # Report the message of the call that actually failed
            gLogger.warn( 'Failed to remove job %s from %s' % ( jobID, dbName ), removal['Message'] )
            errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )

      # In bulk mode the counters only follow the TaskQueueDB deletions
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %s from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()