Example #1
    def getFTSPlacement(cls):
        """ fts placement getter """

        ftsHistory = cls.ftsDB.getFTSHistory()
        if not ftsHistory['OK']:
            gLogger.warn("unable to get FTSHistory for FTSPlacement: %s" %
                         ftsHistory['Message'])
            ftsHistory['Value'] = []
        ftsHistory = ftsHistory['Value']

        return FTSPlacement(ftsHistoryViews=ftsHistory)
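
All of these snippets communicate through DIRAC's result-dictionary convention: a call returns a dict whose 'OK' key selects between a 'Value' payload and a 'Message' error string, which is why the getter above checks ftsHistory['OK'] before unwrapping ftsHistory['Value']. Below is a minimal sketch of that convention with stand-in implementations (the real ones live in DIRAC's ReturnValues utilities):

# Stand-ins illustrating DIRAC's return-value convention, for reference only.

def S_OK(value=None):
    """ success: callers test result['OK'] and unwrap result['Value'] """
    return {'OK': True, 'Value': value}

def S_ERROR(message=''):
    """ failure: callers report result['Message'] """
    return {'OK': False, 'Message': message}

result = S_OK([{'SourceSE': 'CERN-USER', 'TargetSE': 'GRIDKA-USER'}])
if not result['OK']:
    print('call failed: %s' % result['Message'])
else:
    ftsHistory = result['Value']  # unwrap, as getFTSPlacement() does above
    print('%d history views' % len(ftsHistory))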
Example #2
  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()
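
The acquire/release pair wrapped in try/finally does the job, but Python locks are also context managers, so the same lazy-create-or-refresh can be written with a with statement. A sketch under stand-in names (Placement here is not DIRAC's FTSPlacement):

import threading

class Placement(object):
    """ stand-in for FTSPlacement: just remembers the latest history views """
    def __init__(self, historyViews):
        self.historyViews = historyViews

    def refresh(self, historyViews):
        self.historyViews = historyViews

class PlacementHolder(object):
    """ lock-guarded lazy-create-or-refresh, mirroring resetFTSPlacement """
    def __init__(self):
        self._lock = threading.Lock()
        self._placement = None

    def reset(self, historyViews):
        # 'with' releases the lock even if refresh() raises, like the
        # try/finally above; it also acquires *before* entering the guarded
        # block, so a failed acquire can never trigger a spurious release
        with self._lock:
            if self._placement is None:
                self._placement = Placement(historyViews)
            else:
                self._placement.refresh(historyViews)
            return self._placement

holder = PlacementHolder()
holder.reset([{'SourceSE': 'CERN-USER', 'TargetSE': 'CNAF-USER'}])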
Example #3
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled requests to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock
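  # # NB: the lazy creation above is itself unguarded; it relies on the first
  # # call happening in initialize(), before worker threads start, and on
  # # LockRing handing back the same named lock on every call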

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict
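  # # usage sketch (hypothetical FTSFile stand-ins): merging a monitor result
  # # only appends entries not already present in each category, e.g.
  # #   current = { 'toSubmit': [fileA], 'toFail': [] }
  # #   FTSAgent.updateFTSFileDict( current, { 'toSubmit': [fileA, fileB] } )
  # #   # -> current['toSubmit'] == [fileA, fileB]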

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the default proxy to the one defined under
    # /Operations/Shifter/DataManager;
    # the shifterProxy option in the configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate, otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      now = datetime.datetime.utcnow()
      jobsToMonitor = [job for job in ftsJobs if ( now - job.LastUpdate ).seconds > ( self.MONITORING_INTERVAL * ( 3. if job.Status == 'Staging' else 1. ) )]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            for lfn in missingReplicas:
              if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                for opFile in operation:
                  if opFile.LFN == lfn:
                    opFile.Status = 'Done'
                    break
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise escapeTry

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file did not fail unrecoverably and is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except escapeTry:
      # This exception is raised when one wants to return from within the try: clause
      pass
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
      putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )
      # #  put back jobs in all cases
      if ftsJobs:
        putJobs = self.putFTSJobs( ftsJobs )
        if not putJobs["OK"]:
          log.error( "unable to put back FTSJobs:", putJobs["Message"] )
          putRequest = putJobs
    # This is where one returns from after execution of the finally: block
    return putRequest
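
The escapeTry device above (EscapeTryException in the next example) exists to "return early" from deep inside the try: block while still guaranteeing that the finally: cleanup, which puts the request and FTS jobs back, runs exactly once. A self-contained sketch of the idiom with illustrative names:

class EscapeTry(Exception):
    """ raised to bail out of a deep try: block early; not an error """
    pass

def process(items):
    outcome = 'processed %d items' % len(items)
    try:
        if not items:
            outcome = 'nothing to do'
            raise EscapeTry()  # jump straight to the finally: cleanup
        for _item in items:
            pass  # real per-item work would go here
    except EscapeTry:
        pass  # early exit, deliberately swallowed
    finally:
        print('cleanup runs on every path')  # like putRequest()/putFTSJobs()
    return outcome

print(process([]))      # cleanup runs on every path / 'nothing to do'
print(process([1, 2]))  # cleanup runs on every path / 'processed 2 items'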
Example #4
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled requests to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the default proxy to the one defined under
    # /Operations/Shifter/DataManager;
    # the shifterProxy option in the configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate, otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

      now = datetime.datetime.utcnow()
      jobsToMonitor = [job for job in ftsJobs if
                       ( now - job.LastUpdate ).seconds >
                       ( self.MONITORING_INTERVAL * ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) )
                       ]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )
        if not jobsToMonitor:
          # Nothing to do this time, escape
          raise EscapeTryException

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            ftsLFNs = [f.LFN for f in ftsFiles]
            for lfn in missingReplicas:
              # We make sure here that the file is being processed by FTS
              if lfn in ftsLFNs:
                if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                  for opFile in operation:
                    if opFile.LFN == lfn:
                      opFile.Status = 'Done'
                      break
              else:
                # Temporary log
                log.warn( "File with missing replica not in FTS files", lfn )
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise EscapeTryException

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file did not fail unrecoverably and is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except EscapeTryException:
      # This exception is raised when one wants to return from within the try: clause
      # only put back jobs that were monitored
      ftsJobs = jobsToMonitor
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
      putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )
      # #  put back jobs in all cases
      if ftsJobs:
        for ftsJob in list( ftsJobs ):
          if not len( ftsJob ):
            log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID )
            self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            ftsJobs.remove( ftsJob )
        putJobs = self.putFTSJobs( ftsJobs )
        if not putJobs["OK"]:
          log.error( "unable to put back FTSJobs:", putJobs["Message"] )
          putRequest = putJobs
    # This is where one returns from after execution of the finally: block
    return putRequest

  def __checkDuplicates( self, reqID, toSubmit ):
    """ Check in a list of FTSFiles whether there are duplicates
    """
    tupleList = []
    log = self.log.getSubLogger( "%s/checkDuplicates" % reqID )
    for ftsFile in list( toSubmit ):
      fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE )
      if fTuple in tupleList:
        log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) )
        toSubmit.remove( ftsFile )
        self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] )
      else:
        tupleList.append( fTuple )
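    # NB: the linear 'fTuple in tupleList' test above could use a set of
    # (LFN, SourceSE, TargetSE) tuples for O(1) membership on large batches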


  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) )

    ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule]
    for opFile in operation:
      if opFile.FileID in ftsFileIDs:
        opFile.Status = "Waiting"

    toSchedule = []

    # # filter files
    for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]:

      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]
      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas["NoReplicas"]
      badReplicas = replicas['Bad']

      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )
      elif noMetaReplicas:
        log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
      elif noReplicas:
        log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
        opFile.Status = 'Failed'
      elif badReplicas:
        log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
        opFile.Status = 'Failed'

    # # do real schedule here
    if toSchedule:
      log.info( "Rescheduling %d files" % len( toReschedule ) )
      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID,
                                                  operation.OperationID,
                                                  toSchedule )
      if not ftsSchedule["OK"]:
        log.error( "Error scheduling files", ftsSchedule["Message"] )
        return ftsSchedule

      ftsSchedule = ftsSchedule["Value"]
      for opFile in operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          opFile.Status = "Scheduled"
        elif fileID in ftsSchedule["Failed"]:
          opFile.Error = ftsSchedule["Failed"][fileID]
          log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error )

    return S_OK()

  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param Operation operation: 'ReplicateAndRegister' operation being executed
    :param list toSubmit: list of FTSFile instances to submit

    :return: S_OK( [ FTSJob, FTSJob, ... ] )
    """
    log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) )

    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ).setdefault( ftsFile.TargetSE, [] ).append( ftsFile )

    ftsJobs = []

    for source, targetDict in bySourceAndTarget.iteritems():

      for target, ftsFileList in targetDict.iteritems():

        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )

        route = self.__ftsPlacement.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]

        routeValid = self.__ftsPlacement.isRouteValid( route )

        if not routeValid['OK']:
          log.error( "Route invalid : %s" % routeValid['Message'] )
          continue

        sourceSE = StorageElement( source )
        sourceToken = sourceSE.getStorageParameters( protocol = 'srm' )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) )
          continue
        seStatus = sourceSE.getStatus()['Value']

        targetSE = StorageElement( target )
        targetToken = targetSE.getStorageParameters( protocol = 'srm' )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) )
          continue

        # # create FTSJob
        for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ):
          ftsJob = FTSJob()
          ftsJob.RequestID = request.RequestID
          ftsJob.OperationID = operation.OperationID
          ftsJob.SourceSE = source
          ftsJob.TargetSE = target
          ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )
          ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )
          ftsJob.FTSServer = route.ftsServer

          for ftsFile in fileList:
            ftsFile.Attempt += 1
            ftsFile.Error = ""
            ftsJob.addFile( ftsFile )

          submit = ftsJob.submitFTS( self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 )
          if not submit["OK"]:
            log.error( "unable to submit FTSJob:", submit["Message"] )
            continue

          log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

          # # update statuses for job files
          for ftsFile in ftsJob:
            ftsFile.FTSGUID = ftsJob.FTSGUID
            ftsFile.Status = "Submitted"
            ftsFile.Attempt += 1

          # # update placement route
          try:
            self.updateLock().acquire()
            self.__ftsPlacement.startTransferOnRoute( route )
          finally:
            self.updateLock().release()

          ftsJobs.append( ftsJob )

    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
         'was not found' in monitor['Message'] or\
         "Not found" in monitor['Message'] or\
         'Unknown transfer state' in monitor['Message']:
        log.error( "FTSJob not known (expired on server?): delete it" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        # #  No way further for that job: delete it
        res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not res['OK']:
          log.error( "Unable to delete FTSJob", res['Message'] )
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        if 'Unknown transfer state' in finalizeFTSJob['Message']:
          for ftsFile in ftsJob:
            ftsFile.Status = "Waiting"
            ftsFilesDict["toSubmit"].append( ftsFile )
          # #  No way further for that job: delete it
          res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
          if not res['OK']:
            log.error( "Unable to delete FTSJob", res['Message'] )
        else:
          log.error( finalizeFTSJob["Message"] )
          return finalizeFTSJob
      else:
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID,
                                                                     request.RequestName,
                                                                     ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )


    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor

    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    processFiles = processFiles['Value']
    if processFiles['toRegister']:
      log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) )
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles )

    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )

    # # update placement - remove this job from placement
    route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        self.__ftsPlacement.finishTransferOnRoute( route['Value'] )
      finally:
        self.updateLock().release()

    log.info( "FTSJob is finalized" )

    return S_OK( ftsFilesDict )

  def __checkFailed( self, ftsFile ):
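    """ triage a 'Failed'/'Canceled' FTSFile: reschedule it if the source replica
        is missing, resubmit it while attempts remain, fail it for good otherwise
    """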
    reschedule = False
    submit = False
    fail = False
    if ftsFile.Status in ( "Failed", 'Canceled' ):
      if ftsFile.Error == "MissingSource":
        reschedule = True
      else:
        if ftsFile.Attempt < self.MAX_ATTEMPT:
          submit = True
        else:
          fail = True
    return reschedule, submit, fail

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []

    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      reschedule, submit, fail = self.__checkFailed( ftsFile )
      if reschedule:
        toReschedule.append( ftsFile )
      elif submit:
        toSubmit.append( ftsFile )
      elif fail:
        toFail.append( ftsFile )

    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __insertRegisterOperation( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) )

    byTarget = {}
    for ftsFile in toRegister:
      byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )

    for target, ftsFileList in byTarget.iteritems():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                            len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = StorageElement( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) )
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )

    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    # dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    # transferTime = dt.days * 86400 + dt.seconds
    # accountingDict["TransferTime"] = transferTime
    accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation """
    log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) )

    targetSESet = set( operation.targetSEList )

    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}

    scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation if opFile.Status in ( "Scheduled", "Waiting" ) )
    # # get replicas
    replicas = FileCatalog().getReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]

    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.verbose( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.setdefault( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )

    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].iteritems():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )

    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas
    return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
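
All the examples on this page follow DIRAC's return-value convention: a call returns a dictionary whose 'OK' key is checked before reading 'Value' (success) or 'Message' (failure). Below is a minimal sketch of that convention, written as plain Python for illustration only (in DIRAC itself these helpers come from DIRAC.Core.Utilities.ReturnValues):

def S_OK( value = None ):
  """ success result, as returned by the methods above """
  return { "OK": True, "Value": value }

def S_ERROR( message = "" ):
  """ failure result: carries a "Message" instead of a "Value" """
  return { "OK": False, "Message": message }

result = S_ERROR( "unable to get FTS history" )
if not result["OK"]:
  # error path: log the message and bail out, as the agent methods do
  print result["Message"]
else:
  value = result["Value"]
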
Exemplo n.º 6
0
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """

    # Make sure the request is Scheduled
    res = cls.requestClient().getRequestStatus( reqID )
    if not res['OK']:
      cls.__reqCache.pop( reqID, None )
      return res
    status = res['Value']
    if status != 'Scheduled':
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "Request with id %s is not Scheduled:%s" % ( reqID, status ) )

    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize the request if it is Done and attached to a job
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


      # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the default proxy to the one defined under
    # /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = DMSHelpers().getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
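        # generateJobAndQueueIt may fail while the thread pool queue is full: retry until the request is queued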
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

      now = datetime.datetime.utcnow()
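      # monitor a job only if its last update is old enough; tape sources get a 3x longer interval to allow for staging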
      jobsToMonitor = [job for job in ftsJobs if
                       ( now - job.LastUpdate ).seconds >
                       ( self.MONITORING_INTERVAL * ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) )
                       ]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )
        if not jobsToMonitor:
          # Nothing to happen this time, escape
          raise EscapeTryException

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # The condition below should never happen; warn loudly so it can be investigated
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
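              # the pending target SE is encoded in the file status as 'Waiting#<TargetSE>'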
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            ftsLFNs = [f.LFN for f in ftsFiles]
            for lfn in missingReplicas:
              # We make sure here that the file is being processed by FTS
              if lfn in ftsLFNs:
                if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                  for opFile in operation:
                    if opFile.LFN == lfn:
                      opFile.Status = 'Done'
                      break
              else:
                # Temporary log
                log.warn( "File with missing replica not in FTS files", lfn )
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise EscapeTryException

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # resubmit only if the failure is recoverable and the file is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except EscapeTryException:
      # This clause is raised when one wants to return from within the try: clause
      # only put back jobs that were monitored
      ftsJobs = jobsToMonitor
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
      putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )
      # #  put back jobs in all cases
      if ftsJobs:
        for ftsJob in list( ftsJobs ):
          if not len( ftsJob ):
            log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID )
            self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            ftsJobs.remove( ftsJob )
        putJobs = self.putFTSJobs( ftsJobs )
        if not putJobs["OK"]:
          log.error( "unable to put back FTSJobs:", putJobs["Message"] )
          putRequest = putJobs
    # This is where one returns from after execution of the finally: block
    return putRequest

  def __checkDuplicates( self, reqID, toSubmit ):
    """ Check in a list of FTSFiles whether there are duplicates
    """
    tupleList = []
    log = self.log.getSubLogger( "%s/checkDuplicates" % reqID )
    for ftsFile in list( toSubmit ):
      fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE )
      if fTuple in tupleList:
        log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) )
        toSubmit.remove( ftsFile )
        self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] )
      else:
        tupleList.append( fTuple )


  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) )

    ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule]
    for opFile in operation:
      if opFile.FileID in ftsFileIDs:
        opFile.Status = "Waiting"

    toSchedule = []

    # # filter files
    for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]:

      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]
      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas["NoReplicas"]
      badReplicas = replicas['Bad']

      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )
      elif noMetaReplicas:
        log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
      elif noReplicas:
        log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
        opFile.Status = 'Failed'
      elif badReplicas:
        log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
        opFile.Status = 'Failed'

    # # do real schedule here
    if toSchedule:
      log.info( "Rescheduling %d files" % len( toReschedule ) )
      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID,
                                                  operation.OperationID,
                                                  toSchedule )
      if not ftsSchedule["OK"]:
        log.error( "Error scheduling files", ftsSchedule["Message"] )
        return ftsSchedule

      ftsSchedule = ftsSchedule["Value"]
      for opFile in operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          opFile.Status = "Scheduled"
        elif fileID in ftsSchedule["Failed"]:
          opFile.Error = ftsSchedule["Failed"][fileID]
          log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error )

    return S_OK()

  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param Operation operation: 'ReplicateAndRegister' operation being executed
    :param list toSubmit: list of FTSFile instances to submit

    :return: S_OK( [ FTSJob, FTSJob, ... ] )
    """
    log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) )

    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ).setdefault( ftsFile.TargetSE, [] ).append( ftsFile )

    ftsJobs = []

    for source, targetDict in bySourceAndTarget.iteritems():

      for target, ftsFileList in targetDict.iteritems():

        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )

        route = self.__ftsPlacement.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]

        routeValid = self.__ftsPlacement.isRouteValid( route )

        if not routeValid['OK']:
          log.error( "Route invalid : %s" % routeValid['Message'] )
          continue

        sourceSE = StorageElement( source )
        sourceToken = sourceSE.getStorageParameters( protocol = 'srm' )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) )
          continue
        seStatus = sourceSE.getStatus()['Value']

        targetSE = StorageElement( target )
        targetToken = targetSE.getStorageParameters( protocol = 'srm' )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) )
          continue

        # # create FTSJob
        for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ):
          ftsJob = FTSJob()
          ftsJob.RequestID = request.RequestID
          ftsJob.OperationID = operation.OperationID
          ftsJob.SourceSE = source
          ftsJob.TargetSE = target
          ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )
          ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )
          ftsJob.FTSServer = route.ftsServer

          for ftsFile in fileList:
            ftsFile.Attempt += 1
            ftsFile.Error = ""
            ftsJob.addFile( ftsFile )

          submit = ftsJob.submitFTS( self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 )
          if not submit["OK"]:
            log.error( "unable to submit FTSJob:", submit["Message"] )
            continue

          log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

          # # update statuses for job files
          for ftsFile in ftsJob:
            ftsFile.FTSGUID = ftsJob.FTSGUID
            ftsFile.Status = "Submitted"
            ftsFile.Attempt += 1

          # # update placement route
          try:
            self.updateLock().acquire()
            self.__ftsPlacement.startTransferOnRoute( route )
          finally:
            self.updateLock().release()

          ftsJobs.append( ftsJob )

    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
         'was not found' in monitor['Message'] or\
         "Not found" in monitor['Message'] or\
         'Unknown transfer state' in monitor['Message']:
        log.error( "FTSJob not known (expired on server?): delete it" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        # #  No way further for that job: delete it
        res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not res['OK']:
          log.error( "Unable to delete FTSJob", res['Message'] )
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        if 'Unknown transfer state' in finalizeFTSJob['Message']:
          for ftsFile in ftsJob:
            ftsFile.Status = "Waiting"
            ftsFilesDict["toSubmit"].append( ftsFile )
          # #  No way further for that job: delete it
          res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
          if not res['OK']:
            log.error( "Unable to delete FTSJob", res['Message'] )
        else:
          log.error( finalizeFTSJob["Message"] )
          return finalizeFTSJob
      else:
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID,
                                                                     request.RequestName,
                                                                     ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )


    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor

    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    processFiles = processFiles['Value']
    if processFiles['toRegister']:
      log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) )
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles )

    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )

    # # update placement - remove this job from placement
    route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        self.__ftsPlacement.finishTransferOnRoute( route['Value'] )
      finally:
        self.updateLock().release()

    log.info( "FTSJob is finalized" )

    return S_OK( ftsFilesDict )

  def __checkFailed( self, ftsFile ):
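    """ triage a 'Failed'/'Canceled' FTSFile: reschedule it if the source replica
        is missing, resubmit it while attempts remain, fail it for good otherwise
    """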
    reschedule = False
    submit = False
    fail = False
    if ftsFile.Status in ( "Failed", 'Canceled' ):
      if ftsFile.Error == "MissingSource":
        reschedule = True
      else:
        if ftsFile.Attempt < self.MAX_ATTEMPT:
          submit = True
        else:
          fail = True
    return reschedule, submit, fail

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []

    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      reschedule, submit, fail = self.__checkFailed( ftsFile )
      if reschedule:
        toReschedule.append( ftsFile )
      elif submit:
        toSubmit.append( ftsFile )
      elif fail:
        toFail.append( ftsFile )

    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __insertRegisterOperation( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) )

    byTarget = {}
    for ftsFile in toRegister:
      byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
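    # # e.g. ( illustrative ) byTarget == { "CERN-DST": [ <FTSFile>, <FTSFile> ], "PIC-DST": [ <FTSFile> ] }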
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )

    for target, ftsFileList in byTarget.iteritems():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                            len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = StorageElement( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) )
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )

    return S_OK()
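
  # # note ( illustrative, not in the original ): insertBefore places the new
  # # 'RegisterReplica' operations ahead of the transfer operation in the
  # # request, so the missed registrations are retried first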

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    # # sum of per-file transfer durations rather than the job wall-clock time
    # # ( which would be ( ftsJob.LastUpdate - ftsJob.SubmitTime ) in seconds )
    accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation """
    log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) )

    targetSESet = set( operation.targetSEList )

    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}

    scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation if opFile.Status in ( "Scheduled", "Waiting" ) )
    # # get replicas
    replicas = FileCatalog().getReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      log.error( "unable to get replicas for scheduled files:", replicas["Message"] )
      return replicas
    replicas = replicas["Value"]

    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.verbose( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.get( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )

    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].iteritems():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )

    return S_OK( missingReplicas )
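
  # # illustrative sketch ( not part of the original example ) of the subset
  # # test used above:
  # #   targetSESet = set( [ "CERN-DST", "GRIDKA-DST" ] )
  # #   reps = set( [ "CERN-DST" ] )
  # #   targetSESet.issubset( reps )   # False: still missing somewhere
  # #   sorted( targetSESet - reps )   # [ "GRIDKA-DST" ]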

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas
    return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
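
  # # note ( illustrative ): filterReplicas is shared with the ReplicateAndRegister
  # # request operation; it returns, for a single file, the source SEs that are
  # # actually usable, dropping banned or otherwise invalid ones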