Example #1
0
class TransformationAgent(AgentModule, TransformationAgentsUtilities):
    """ Usually subclass of AgentModule
  """
    def __init__(self, *args, **kwargs):
        """ c'tor
    """
        AgentModule.__init__(self, *args, **kwargs)
        TransformationAgentsUtilities.__init__(self)

        # few parameters
        self.pluginLocation = ''
        self.transformationStatus = []
        self.maxFiles = 0
        self.transformationTypes = []

        # clients (out of the threads)
        self.transfClient = None

        # parameters for the threading
        self.transQueue = Queue.Queue()
        self.transInQueue = []

        # parameters for caching
        self.workDirectory = ''
        self.cacheFile = ''
        self.controlDirectory = ''

        self.lastFileOffset = {}
        # Validity of the cache
        self.replicaCache = None
        self.replicaCacheValidity = None
        self.writingCache = False
        self.removedFromCache = 0

        self.noUnusedDelay = 0
        self.unusedFiles = {}
        self.unusedTimeStamp = {}

        self.debug = False
        self.transInThread = {}
        self.pluginTimeout = {}

    def initialize(self):
        """ standard initialize
    """
        # few parameters
        self.pluginLocation = self.am_getOption(
            'PluginLocation',
            'DIRAC.TransformationSystem.Agent.TransformationPlugin')
        self.transformationStatus = self.am_getOption(
            'transformationStatus', ['Active', 'Completing', 'Flush'])
        # Prepare to change the name of the CS option as MaxFiles is ambiguous
        self.maxFiles = self.am_getOption('MaxFilesToProcess',
                                          self.am_getOption('MaxFiles', 5000))

        agentTSTypes = self.am_getOption('TransformationTypes', [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            dataProc = Operations().getValue('Transformations/DataProcessing',
                                             ['MCSimulation', 'Merge'])
            dataManip = Operations().getValue(
                'Transformations/DataManipulation', ['Replication', 'Removal'])
            self.transformationTypes = sorted(dataProc + dataManip)

        # clients
        self.transfClient = TransformationClient()

        # for caching using a pickle file
        self.workDirectory = self.am_getWorkDirectory()
        self.cacheFile = os.path.join(self.workDirectory, 'ReplicaCache.pkl')
        self.controlDirectory = self.am_getControlDirectory()

        # remember the offset if any in TS
        self.lastFileOffset = {}

        # Validity of the cache
        self.replicaCache = {}
        self.replicaCacheValidity = self.am_getOption('ReplicaCacheValidity',
                                                      2)

        self.noUnusedDelay = self.am_getOption('NoUnusedDelay', 6)

        # Get it threaded
        maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
        threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
        self.log.info("Multithreaded with %d threads" % maxNumberOfThreads)

        for i in xrange(maxNumberOfThreads):
            threadPool.generateJobAndQueueIt(self._execute, [i])

        self.log.info("Will treat the following transformation types: %s" %
                      str(self.transformationTypes))

        return S_OK()

    def finalize(self):
        """ graceful finalization
    """
        method = 'finalize'
        if self.transInQueue:
            self.transInQueue = []
            self._logInfo(
                "Wait for threads to get empty before terminating the agent (%d tasks)"
                % len(self.transInThread),
                method=method)
            self._logInfo('Remaining transformations:',
                          ','.join(
                              str(transID) for transID in self.transInThread),
                          method=method)
            while self.transInThread:
                time.sleep(2)
            self._logInfo("Threads are empty, terminating the agent...",
                          method=method)
        self.__writeCache()
        return S_OK()

    def execute(self):
        """ Just puts transformations in the queue
    """
        # Get the transformations to process
        res = self.getTransformations()
        if not res['OK']:
            self._logError("Failed to obtain transformations:", res['Message'])
            return S_OK()
        # Process the transformations
        count = 0
        for transDict in res['Value']:
            transID = long(transDict['TransformationID'])
            if transDict.get('InheritedFrom'):
                # Try and move datasets from the ancestor production
                res = self.transfClient.moveFilesToDerivedTransformation(
                    transDict)
                if not res['OK']:
                    self._logError(
                        "Error moving files from an inherited transformation",
                        res['Message'],
                        transID=transID)
                else:
                    parentProd, movedFiles = res['Value']
                    if movedFiles:
                        self._logInfo(
                            "Successfully moved files from %d to %d:" %
                            (parentProd, transID),
                            transID=transID)
                        for status, val in movedFiles.iteritems():
                            self._logInfo("\t%d files to status %s" %
                                          (val, status),
                                          transID=transID)
            if transID not in self.transInQueue:
                count += 1
                self.transInQueue.append(transID)
                self.transQueue.put(transDict)
        self._logInfo("Out of %d transformations, %d put in thread queue" %
                      (len(res['Value']), count))
        return S_OK()

    def getTransformations(self):
        """ Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
        only real thing in the execute()
    """
        transName = self.am_getOption('Transformation', 'All')
        method = 'getTransformations'
        if transName == 'All':
            self._logInfo("Getting all transformations%s, status %s." %
                          (' of type %s' % str(self.transformationTypes)
                           if self.transformationTypes else '',
                           str(self.transformationStatus)),
                          method=method)
            transfDict = {'Status': self.transformationStatus}
            if self.transformationTypes:
                transfDict['Type'] = self.transformationTypes
            res = self.transfClient.getTransformations(transfDict,
                                                       extraParams=True)
            if not res['OK']:
                return res
            transformations = res['Value']
            self._logInfo("Obtained %d transformations to process" %
                          len(transformations),
                          method=method)
        else:
            self._logInfo("Getting transformation %s." % transName,
                          method=method)
            res = self.transfClient.getTransformation(transName,
                                                      extraParams=True)
            if not res['OK']:
                self._logError("Failed to get transformation:",
                               res['Message'],
                               method=method)
                return res
            transformations = [res['Value']]
        return S_OK(transformations)

    def _getClients(self):
        """ returns the clients used in the threads
    """
        threadTransformationClient = TransformationClient()
        threadDataManager = DataManager()

        return {
            'TransformationClient': threadTransformationClient,
            'DataManager': threadDataManager
        }

    def _execute(self, threadID):
        """ thread - does the real job: processing the transformations to be processed
    """

        # Each thread will have its own clients
        clients = self._getClients()

        while True:
            transDict = self.transQueue.get()
            transID = None
            startTime = time.time()
            try:
                transID = long(transDict['TransformationID'])
                if transID not in self.transInQueue:
                    break
                self.transInThread[transID] = ' [Thread%d] [%s] ' % (
                    threadID, str(transID))
                self._logInfo("Processing transformation %s." % transID,
                              transID=transID)
                startTime = time.time()
                res = self.processTransformation(transDict, clients)
                if not res['OK']:
                    self._logInfo("Failed to process transformation:",
                                  res['Message'],
                                  transID=transID)
            except Exception as x:  # pylint: disable=broad-except
                self._logException('Exception in plugin',
                                   lException=x,
                                   transID=transID)
            finally:
                if not transID:
                    transID = 'None'
                self._logInfo("Processed transformation in %.1f seconds" %
                              (time.time() - startTime),
                              transID=transID)
                if transID in self.transInQueue:
                    self.transInQueue.remove(transID)
                self.transInThread.pop(transID, None)
                self._logVerbose("%d transformations still in queue" %
                                 len(self.transInQueue))
        return S_OK()

    def processTransformation(self, transDict, clients):
        """ process a single transformation (in transDict)
    """
        method = 'processTransformation'
        transID = transDict['TransformationID']
        forJobs = transDict['Type'].lower() not in ('replication', 'removal')
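        # Replication and Removal transformations are treated file by file;
        # everything else produces jobs from the input data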

        # First get the LFNs associated to the transformation
        transFiles = self._getTransformationFiles(
            transDict, clients, replicateOrRemove=not forJobs)
        if not transFiles['OK']:
            return transFiles
        if not transFiles['Value']:
            return S_OK()

        if transID not in self.replicaCache:
            self.__readCache(transID)
        transFiles = transFiles['Value']
        unusedLfns = [f['LFN'] for f in transFiles]
        unusedFiles = len(unusedLfns)

        plugin = transDict.get('Plugin', 'Standard')
        # Limit the number of LFNs to be considered for replication or removal as they are treated individually
        if not forJobs:
            # Get the plugin-specific limit in number of files (0 means no limit)
            maxFiles = Operations().getValue(
                'TransformationPlugins/%s/MaxFilesToProcess' % plugin, 0)
            totLfns = len(unusedLfns)
            lfnsToProcess = self.__applyReduction(unusedLfns,
                                                  maxFiles=maxFiles)
            if len(lfnsToProcess) != totLfns:
                self._logInfo("Reduced number of files from %d to %d" %
                              (totLfns, len(lfnsToProcess)),
                              method=method,
                              transID=transID)
                transFiles = [
                    f for f in transFiles if f['LFN'] in lfnsToProcess
                ]
        else:
            lfnsToProcess = unusedLfns

        # Check that the data is available with replicas
        res = self.__getDataReplicas(transDict,
                                     lfnsToProcess,
                                     clients,
                                     forJobs=forJobs)
        if not res['OK']:
            self._logError("Failed to get data replicas:",
                           res['Message'],
                           method=method,
                           transID=transID)
            return res
        dataReplicas = res['Value']

        # Get the plug-in type and create the plug-in object
        self._logInfo("Processing transformation with '%s' plug-in." % plugin,
                      method=method,
                      transID=transID)
        res = self.__generatePluginObject(plugin, clients)
        if not res['OK']:
            return res
        oPlugin = res['Value']

        # Get the plug-in and set the required params
        oPlugin.setParameters(transDict)
        oPlugin.setInputData(dataReplicas)
        oPlugin.setTransformationFiles(transFiles)
        res = oPlugin.run()
        if not res['OK']:
            self._logError("Failed to generate tasks for transformation:",
                           res['Message'],
                           method=method,
                           transID=transID)
            return res
        tasks = res['Value']
        self.pluginTimeout[transID] = res.get('Timeout', False)
        # Create the tasks
        allCreated = True
        created = 0
        lfnsInTasks = []
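        # Each task returned by the plugin is a (targetSE, LFN-list) pair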
        for se, lfns in tasks:
            res = clients['TransformationClient'].addTaskForTransformation(
                transID, lfns, se)
            if not res['OK']:
                self._logError("Failed to add task generated by plug-in:",
                               res['Message'],
                               method=method,
                               transID=transID)
                allCreated = False
            else:
                created += 1
                lfnsInTasks += [lfn for lfn in lfns if lfn in lfnsToProcess]
        if created:
            self._logInfo("Successfully created %d tasks for transformation." %
                          created,
                          method=method,
                          transID=transID)
        else:
            self._logInfo("No new tasks created for transformation.",
                          method=method,
                          transID=transID)
        self.unusedFiles[transID] = unusedFiles - len(lfnsInTasks)
        # If not all files were obtained, move the offset
        lastOffset = self.lastFileOffset.get(transID)
        if lastOffset:
            self.lastFileOffset[transID] = max(0,
                                               lastOffset - len(lfnsInTasks))
        self.__removeFilesFromCache(transID, lfnsInTasks)

        # If this production is to Flush
        if transDict['Status'] == 'Flush' and allCreated:
            res = clients['TransformationClient'].setTransformationParameter(
                transID, 'Status', 'Active')
            if not res['OK']:
                self._logError(
                    "Failed to update transformation status to 'Active':",
                    res['Message'],
                    method=method,
                    transID=transID)
            else:
                self._logInfo("Updated transformation status to 'Active'.",
                              method=method,
                              transID=transID)
        return S_OK()

    ######################################################################
    #
    # Internal methods used by the agent
    #

    def _getTransformationFiles(self,
                                transDict,
                                clients,
                                statusList=None,
                                replicateOrRemove=False):
        """ get the data replicas for a certain transID
    """
        # By default, don't skip if no new Unused for DM transformations
        skipIfNoNewUnused = not replicateOrRemove
        transID = transDict['TransformationID']
        plugin = transDict.get('Plugin', 'Standard')
        # Check if files should be sorted and limited in number
        operations = Operations()
        sortedBy = operations.getValue(
            'TransformationPlugins/%s/SortedBy' % plugin, None)
        maxFiles = operations.getValue(
            'TransformationPlugins/%s/MaxFilesToProcess' % plugin, 0)
        # If NoUnusedDelay is explicitly set for the plugin, take it into account and skip if no new Unused files
        if operations.getValue(
                'TransformationPlugins/%s/NoUnusedDelay' % plugin, 0):
            skipIfNoNewUnused = True
        noUnusedDelay = 0 if self.pluginTimeout.get(transID, False) else \
            operations.getValue('TransformationPlugins/%s/NoUnusedDelay' % plugin, self.noUnusedDelay)
        method = '_getTransformationFiles'
        lastOffset = self.lastFileOffset.setdefault(transID, 0)

        # Files that were problematic (either explicitly or because the SE was banned) may be recovered;
        # files missing from the catalog are in addition picked up for Removal transformations
        if not statusList:
            statusList = ['Unused', 'ProbInFC']
        statusList += ['MissingInFC'] if transDict['Type'] == 'Removal' else []
        transClient = clients['TransformationClient']
        res = transClient.getTransformationFiles(condDict={
            'TransformationID': transID,
            'Status': statusList
        },
                                                 orderAttribute=sortedBy,
                                                 offset=lastOffset,
                                                 maxfiles=maxFiles)
        if not res['OK']:
            self._logError("Failed to obtain input data:",
                           res['Message'],
                           method=method,
                           transID=transID)
            return res
        transFiles = res['Value']
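        # If a full page of maxFiles was returned, advance the offset for the
        # next cycle; otherwise everything was fetched and the offset is dropped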
        if maxFiles and len(transFiles) == maxFiles:
            self.lastFileOffset[transID] += maxFiles
        else:
            del self.lastFileOffset[transID]

        if not transFiles:
            self._logInfo("No '%s' files found for transformation." %
                          ','.join(statusList),
                          method=method,
                          transID=transID)
            if transDict['Status'] == 'Flush':
                res = transClient.setTransformationParameter(
                    transID, 'Status', 'Active')
                if not res['OK']:
                    self._logError(
                        "Failed to update transformation status to 'Active':",
                        res['Message'],
                        method=method,
                        transID=transID)
                else:
                    self._logInfo("Updated transformation status to 'Active'.",
                                  method=method,
                                  transID=transID)
            return S_OK()
        # Check if transformation is kicked
        kickFile = os.path.join(self.controlDirectory,
                                'KickTransformation_%s' % str(transID))
        try:
            kickTrans = os.path.exists(kickFile)
            if kickTrans:
                os.remove(kickFile)
        except OSError:
            pass

        # Check if something new happened
        now = datetime.datetime.utcnow()
        if not kickTrans and skipIfNoNewUnused and noUnusedDelay:
            nextStamp = self.unusedTimeStamp.setdefault(
                transID, now) + datetime.timedelta(hours=noUnusedDelay)
            skip = now < nextStamp
            if len(transFiles) == self.unusedFiles.get(
                    transID, 0) and transDict['Status'] != 'Flush' and skip:
                self._logInfo("No new '%s' files found for transformation." %
                              ','.join(statusList),
                              method=method,
                              transID=transID)
                return S_OK()

        self.unusedTimeStamp[transID] = now
        # If files are not Unused, set them Unused
        notUnused = [
            trFile['LFN'] for trFile in transFiles
            if trFile['Status'] != 'Unused'
        ]
        otherStatuses = sorted(
            set([trFile['Status'] for trFile in transFiles]) - set(['Unused']))
        if notUnused:
            res = transClient.setFileStatusForTransformation(transID,
                                                             'Unused',
                                                             notUnused,
                                                             force=True)
            if not res['OK']:
                self._logError("Error setting %d files Unused:" %
                               len(notUnused),
                               res['Message'],
                               method=method,
                               transID=transID)
            else:
                self._logInfo("Set %d files from %s to Unused" %
                              (len(notUnused), ','.join(otherStatuses)))
                self.__removeFilesFromCache(transID, notUnused)
        return S_OK(transFiles)

    def __applyReduction(self, lfns, maxFiles=None):
        """ eventually remove the number of files to be considered
    """
        if maxFiles is None:
            maxFiles = self.maxFiles
        if not maxFiles or len(lfns) <= maxFiles:
            return lfns
        return randomize(lfns)[:maxFiles]

    def __getDataReplicas(self, transDict, lfns, clients, forJobs=True):
        """ Get the replicas for the LFNs and check their statuses. It first looks within the cache.
    """
        method = '__getDataReplicas'
        transID = transDict['TransformationID']
        if 'RemoveFile' in transDict['Body']:
            # When removing files, we don't care about their replicas
            return S_OK(dict.fromkeys(lfns, ['None']))
        clearCacheFile = os.path.join(self.controlDirectory,
                                      'ClearCache_%s' % str(transID))
        try:
            clearCache = os.path.exists(clearCacheFile)
            if clearCache:
                os.remove(clearCacheFile)
        except OSError:
            pass
        if clearCache or transDict['Status'] == 'Flush':
            self._logInfo("Replica cache cleared",
                          method=method,
                          transID=transID)
            # We may need to get new replicas
            self.__clearCacheForTrans(transID)
        else:
            # If the cache needs to be cleaned
            self.__cleanCache(transID)
        startTime = time.time()
        dataReplicas = {}
        nLfns = len(lfns)
        self._logVerbose("Getting replicas for %d files" % nLfns,
                         method=method,
                         transID=transID)
        cachedReplicaSets = self.replicaCache.get(transID, {})
        cachedReplicas = {}
        # Merge all sets of replicas
        for replicas in cachedReplicaSets.itervalues():
            cachedReplicas.update(replicas)
        self._logInfo("Number of cached replicas: %d" % len(cachedReplicas),
                      method=method,
                      transID=transID)
        setCached = set(cachedReplicas)
        setLfns = set(lfns)
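        # Serve whatever is already cached; only the misses go to the catalog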
        for lfn in setLfns & setCached:
            dataReplicas[lfn] = cachedReplicas[lfn]
        newLFNs = setLfns - setCached
        self._logInfo("ReplicaCache hit for %d out of %d LFNs" %
                      (len(dataReplicas), nLfns),
                      method=method,
                      transID=transID)
        if newLFNs:
            startTime = time.time()
            self._logInfo("Getting replicas for %d files from catalog" %
                          len(newLFNs),
                          method=method,
                          transID=transID)
            newReplicas = {}
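            # Query the catalog in chunks of 10000 LFNs to bound request size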
            for chunk in breakListIntoChunks(newLFNs, 10000):
                res = self._getDataReplicasDM(transID,
                                              chunk,
                                              clients,
                                              forJobs=forJobs)
                if res['OK']:
                    reps = dict((lfn, ses)
                                for lfn, ses in res['Value'].iteritems()
                                if ses)
                    newReplicas.update(reps)
                    self.__updateCache(transID, reps)
                else:
                    self._logWarn("Failed to get replicas for %d files" %
                                  len(chunk),
                                  res['Message'],
                                  method=method,
                                  transID=transID)

            self._logInfo("Obtained %d replicas from catalog in %.1f seconds" %
                          (len(newReplicas), time.time() - startTime),
                          method=method,
                          transID=transID)
            dataReplicas.update(newReplicas)
            noReplicas = newLFNs - set(dataReplicas)
            self.__writeCache(transID)
            if noReplicas:
                self._logWarn(
                    "Found %d files without replicas (or only in Failover)" %
                    len(noReplicas),
                    method=method,
                    transID=transID)
        return S_OK(dataReplicas)

    def _getDataReplicasDM(self,
                           transID,
                           lfns,
                           clients,
                           forJobs=True,
                           ignoreMissing=False):
        """ Get the replicas for the LFNs and check their statuses, using the replica manager
    """
        method = '_getDataReplicasDM'

        startTime = time.time()
        self._logVerbose("Getting replicas%s from catalog for %d files" %
                         (' for jobs' if forJobs else '', len(lfns)),
                         method=method,
                         transID=transID)
        if forJobs:
            # Get only replicas eligible for jobs
            res = clients['DataManager'].getReplicasForJobs(lfns, getUrl=False)
        else:
            # Get all replicas
            res = clients['DataManager'].getReplicas(lfns, getUrl=False)
        if not res['OK']:
            return res
        replicas = res['Value']
        # Prepare a dictionary for all LFNs
        dataReplicas = {}
        self._logVerbose(
            "Replica results for %d files obtained in %.2f seconds" %
            (len(lfns), time.time() - startTime),
            method=method,
            transID=transID)
        # If files are neither Successful nor Failed, they are set problematic in the FC
        problematicLfns = [
            lfn for lfn in lfns if lfn not in replicas['Successful']
            and lfn not in replicas['Failed']
        ]
        if problematicLfns:
            self._logInfo(
                "%d files found problematic in the catalog, set ProbInFC" %
                len(problematicLfns))
            res = clients[
                'TransformationClient'].setFileStatusForTransformation(
                    transID, 'ProbInFC', problematicLfns)
            if not res['OK']:
                self._logError("Failed to update status of problematic files:",
                               res['Message'],
                               method=method,
                               transID=transID)
        # Create a dictionary containing all the file replicas
        failoverLfns = []
        for lfn, replicaDict in replicas['Successful'].iteritems():
            for se in replicaDict:
                # This remains here for backward compatibility, in case VOs have not defined SEs that must not be used for jobs
                if forJobs and 'failover' in se.lower():
                    self._logVerbose("Ignoring failover replica for %s." % lfn,
                                     method=method,
                                     transID=transID)
                else:
                    dataReplicas.setdefault(lfn, []).append(se)
            if not dataReplicas.get(lfn):
                failoverLfns.append(lfn)
        if failoverLfns:
            self._logVerbose(
                "%d files have no replica but possibly in Failover SE" %
                len(failoverLfns))
        # Make sure that files missing from the catalog are marked in the transformation DB.
        missingLfns = []
        for lfn, reason in replicas['Failed'].iteritems():
            if "No such file or directory" in reason:
                self._logVerbose("%s not found in the catalog." % lfn,
                                 method=method,
                                 transID=transID)
                missingLfns.append(lfn)
        if missingLfns:
            self._logInfo("%d files not found in the catalog" %
                          len(missingLfns))
            if ignoreMissing:
                dataReplicas.update(dict((lfn, []) for lfn in missingLfns))
            else:
                res = clients[
                    'TransformationClient'].setFileStatusForTransformation(
                        transID, 'MissingInFC', missingLfns)
                if not res['OK']:
                    self._logError("Failed to update status of missing files:",
                                   res['Message'],
                                   method=method,
                                   transID=transID)
        return S_OK(dataReplicas)

    def __updateCache(self, transID, newReplicas):
        """ Add replicas to the cache
    """
        self.replicaCache.setdefault(
            transID, {})[datetime.datetime.utcnow()] = newReplicas
        # if len(newReplicas) > 5000:
        #     self.__writeCache(transID)

    def __clearCacheForTrans(self, transID):
        """ Remove all replicas for a transformation
    """
        self.replicaCache.pop(transID, None)

    def __cleanReplicas(self, transID, lfns):
        """ Remove cached replicas that are not in a list
    """
        cachedReplicas = set()
        for replicas in self.replicaCache.get(transID, {}).itervalues():
            cachedReplicas.update(replicas)
        toRemove = cachedReplicas - set(lfns)
        if toRemove:
            self._logInfo("Remove %d files from cache" % len(toRemove),
                          method='__cleanReplicas',
                          transID=transID)
            self.__removeFromCache(transID, toRemove)

    def __cleanCache(self, transID):
        """ Cleans the cache
    """
        try:
            if transID in self.replicaCache:
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                    days=self.replicaCacheValidity)
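                # Drop cache entries older than the validity (in days), and empty ones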
                for updateTime in set(self.replicaCache[transID]):
                    nCache = len(self.replicaCache[transID][updateTime])
                    if updateTime < timeLimit or not nCache:
                        self._logInfo(
                            "Clear %s replicas for transformation %s, time %s"
                            %
                            ('%d cached' % nCache if nCache else 'empty cache',
                             str(transID), str(updateTime)),
                            transID=transID,
                            method='__cleanCache')
                        del self.replicaCache[transID][updateTime]
                # Remove empty transformations
                if not self.replicaCache[transID]:
                    del self.replicaCache[transID]
        except Exception as x:
            self._logException("Exception when cleaning replica cache:",
                               lException=x)

    def __removeFilesFromCache(self, transID, lfns):
        removed = self.__removeFromCache(transID, lfns)
        if removed:
            self._logInfo("Removed %d replicas from cache" % removed,
                          method='__removeFilesFromCache',
                          transID=transID)
            self.__writeCache(transID)

    def __removeFromCache(self, transID, lfns):
        if transID not in self.replicaCache:
            return 0
        removed = 0
        if self.replicaCache[transID] and lfns:
            for lfn in lfns:
                for timeKey in self.replicaCache[transID]:
                    if self.replicaCache[transID][timeKey].pop(lfn, None):
                        removed += 1
        return removed

    def __cacheFile(self, transID):
        return self.cacheFile.replace('.pkl', '_%s.pkl' % str(transID))

    @gSynchro
    def __readCache(self, transID):
        """ Reads from the cache
    """
        if transID in self.replicaCache:
            return
        method = '__readCache'
        fileName = self.__cacheFile(transID)
        try:
            if not os.path.exists(fileName):
                self.replicaCache[transID] = {}
            else:
                with open(fileName, 'rb') as cacheFile:
                    self.replicaCache[transID] = pickle.load(cacheFile)
                self._logInfo(
                    "Successfully loaded replica cache from file %s (%d files)"
                    % (fileName, self.__filesInCache(transID)),
                    method=method,
                    transID=transID)
        except Exception as x:
            self._logException("Failed to load replica cache from file %s" %
                               fileName,
                               lException=x,
                               method=method,
                               transID=transID)
            self.replicaCache[transID] = {}

    def __filesInCache(self, transID):
        cache = self.replicaCache.get(transID, {})
        return sum(len(lfns) for lfns in cache.itervalues())

    @gSynchro
    def __writeCache(self, transID=None):
        """ Writes the cache
    """
        method = '__writeCache'
        try:
            startTime = time.time()
            transList = [transID] if transID else set(self.replicaCache)
            filesInCache = 0
            nCache = 0
            for t_id in transList:
                # Protect the copy of the cache
                filesInCache += self.__filesInCache(t_id)
                # write to a temporary file in order to avoid corrupted files
                cacheFile = self.__cacheFile(t_id)
                tmpFile = cacheFile + '.tmp'
                with open(tmpFile, 'wb') as fd:
                    pickle.dump(self.replicaCache.get(t_id, {}), fd)
                # Now rename the file as it should be
                os.rename(tmpFile, cacheFile)
                nCache += 1
            self._logInfo(
                "Successfully wrote %d replica cache file(s) (%d files) in %.1f seconds"
                % (nCache, filesInCache, time.time() - startTime),
                method=method,
                transID=transID if transID else None)
        except Exception as x:
            self._logException("Could not write replica cache file(s)",
                               lException=x,
                               method=method,
                               transID=transID)

    def __generatePluginObject(self, plugin, clients):
        """ This simply instantiates the TransformationPlugin class with the relevant plugin name
    """
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ['TransformationPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TransformationPlugin' %s" %
                               plugin,
                               lException=e,
                               method="__generatePluginObject")
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule, 'TransformationPlugin')(
                '%s' % plugin,
                transClient=clients['TransformationClient'],
                dataManager=clients['DataManager'])
        except AttributeError as e:
            self._logException("Failed to create %s()" % plugin,
                               lException=e,
                               method="__generatePluginObject")
            return S_ERROR()
        plugin_o.setDirectory(self.workDirectory)
        plugin_o.setCallback(self.pluginCallback)
        return S_OK(plugin_o)

    def pluginCallback(self, transID, invalidateCache=False):
        """ Standard plugin callback
    """
        if invalidateCache:
            try:
                if transID in self.replicaCache:
                    self._logInfo("Removing cached replicas for transformation",
                                  method='pluginCallback',
                                  transID=transID)
                    self.replicaCache.pop(transID)
                    self.__writeCache(transID)
            except Exception:
                pass
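A minimal standalone sketch (not part of the example above) of the producer/consumer pattern that execute() and _execute() implement; it uses only the Python 2 standard library, and all names below are hypothetical:

import Queue
import threading
import time

workQueue = Queue.Queue()  # plays the role of self.transQueue
inQueue = []               # plays the role of self.transInQueue

def producer(itemIDs):
    # execute(): enqueue each item exactly once
    for itemID in itemIDs:
        if itemID not in inQueue:
            inQueue.append(itemID)
            workQueue.put(itemID)

def worker():
    # _execute(): block on the queue, process, then de-register the item
    while True:
        itemID = workQueue.get()
        try:
            time.sleep(0.1)  # stands in for processTransformation()
        finally:
            if itemID in inQueue:
                inQueue.remove(itemID)

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

producer([101, 102, 103])
time.sleep(1)  # give the workers time to drain the queue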
Example #2
0
class TransformationAgent(AgentModule, TransformationAgentsUtilities):
    """ Usually subclass of AgentModule
  """
    def __init__(self, *args, **kwargs):
        """ c'tor
    """
        AgentModule.__init__(self, *args, **kwargs)
        TransformationAgentsUtilities.__init__(self)

        # few parameters
        self.pluginLocation = ''
        self.transformationStatus = []
        self.maxFiles = 0
        self.transformationTypes = []

        # clients (out of the threads)
        self.transfClient = None

        # parameters for the threading
        self.transQueue = Queue.Queue()
        self.transInQueue = []

        # parameters for caching
        self.workDirectory = ''
        self.cacheFile = ''
        self.controlDirectory = ''

        self.lastFileOffset = {}
        # Validity of the cache
        self.replicaCache = None
        self.replicaCacheValidity = None
        self.writingCache = False
        self.removedFromCache = 0

        self.noUnusedDelay = 0
        self.unusedFiles = {}
        self.unusedTimeStamp = {}

        self.debug = False
        self.transInThread = {}
        self.pluginTimeout = {}

    def initialize(self):
        """ standard initialize
    """
        # few parameters
        self.pluginLocation = self.am_getOption(
            'PluginLocation',
            'DIRAC.TransformationSystem.Agent.TransformationPlugin')
        self.transformationStatus = self.am_getOption(
            'transformationStatus', ['Active', 'Completing', 'Flush'])
        self.maxFiles = self.am_getOption('MaxFiles', 5000)

        agentTSTypes = self.am_getOption('TransformationTypes', [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            dataProc = Operations().getValue('Transformations/DataProcessing',
                                             ['MCSimulation', 'Merge'])
            dataManip = Operations().getValue(
                'Transformations/DataManipulation', ['Replication', 'Removal'])
            self.transformationTypes = sorted(dataProc + dataManip)

        # clients
        self.transfClient = TransformationClient()

        # for caching using a pickle file
        self.workDirectory = self.am_getWorkDirectory()
        self.cacheFile = os.path.join(self.workDirectory, 'ReplicaCache.pkl')
        self.controlDirectory = self.am_getControlDirectory()

        # remember the offset if any in TS
        self.lastFileOffset = {}

        # Validity of the cache
        self.replicaCache = {}
        self.replicaCacheValidity = self.am_getOption('ReplicaCacheValidity',
                                                      2)

        self.noUnusedDelay = self.am_getOption('NoUnusedDelay', 6)

        # Get it threaded
        maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
        threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
        self.log.info("Multithreaded with %d threads" % maxNumberOfThreads)

        for i in xrange(maxNumberOfThreads):
            threadPool.generateJobAndQueueIt(self._execute, [i])

        self.log.info("Will treat the following transformation types: %s" %
                      str(self.transformationTypes))

        return S_OK()

    def finalize(self):
        """ graceful finalization
    """
        method = 'finalize'
        if self.transInQueue:
            self.transInQueue = []
            self._logInfo(
                "Wait for threads to get empty before terminating the agent (%d tasks)"
                % len(self.transInThread),
                method=method)
            self._logInfo(
                'Remaining transformations: ' +
                ','.join([str(transID) for transID in self.transInThread]),
                method=method)
            while self.transInThread:
                time.sleep(2)
            self._logInfo("Threads are empty, terminating the agent...",
                          method=method)
        self.__writeCache()
        return S_OK()

    def execute(self):
        """ Just puts transformations in the queue
    """
        # Get the transformations to process
        res = self.getTransformations()
        if not res['OK']:
            self._logError("Failed to obtain transformations:", res['Message'])
            return S_OK()
        # Process the transformations
        count = 0
        for transDict in res['Value']:
            transID = long(transDict['TransformationID'])
            if transDict.get('InheritedFrom'):
                # Try and move datasets from the ancestor production
                res = self.transfClient.moveFilesToDerivedTransformation(
                    transDict)
                if not res['OK']:
                    self._logError(
                        "Error moving files from an inherited transformation",
                        res['Message'],
                        transID=transID)
                else:
                    parentProd, movedFiles = res['Value']
                    if movedFiles:
                        self._logInfo(
                            "Successfully moved files from %d to %d:" %
                            (parentProd, transID),
                            transID=transID)
                        for status, val in movedFiles.items():
                            self._logInfo("\t%d files to status %s" %
                                          (val, status),
                                          transID=transID)
            if transID not in self.transInQueue:
                count += 1
                self.transInQueue.append(transID)
                self.transQueue.put(transDict)
        self._logInfo("Out of %d transformations, %d put in thread queue" %
                      (len(res['Value']), count))
        return S_OK()

    def getTransformations(self):
        """ Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
        only real thing in the execute()
    """
        transName = self.am_getOption('Transformation', 'All')
        method = 'getTransformations'
        if transName == 'All':
            self._logInfo("Getting all transformations%s, status %s." %
                          (' of type %s' % str(self.transformationTypes)
                           if self.transformationTypes else '',
                           str(self.transformationStatus)),
                          method=method)
            transfDict = {'Status': self.transformationStatus}
            if self.transformationTypes:
                transfDict['Type'] = self.transformationTypes
            res = self.transfClient.getTransformations(transfDict,
                                                       extraParams=True)
            if not res['OK']:
                return res
            transformations = res['Value']
            self._logInfo("Obtained %d transformations to process" %
                          len(transformations),
                          method=method)
        else:
            self._logInfo("Getting transformation %s." % transName,
                          method=method)
            res = self.transfClient.getTransformation(transName,
                                                      extraParams=True)
            if not res['OK']:
                self._logError("Failed to get transformation:",
                               res['Message'],
                               method=method)
                return res
            transformations = [res['Value']]
        return S_OK(transformations)

    def _getClients(self):
        """ returns the clients used in the threads
    """
        threadTransformationClient = TransformationClient()
        threadDataManager = DataManager()

        return {
            'TransformationClient': threadTransformationClient,
            'DataManager': threadDataManager
        }

    def _execute(self, threadID):
        """ thread - does the real job: processing the transformations to be processed
    """

        # Each thread will have its own clients
        clients = self._getClients()

        while True:
            transDict = self.transQueue.get()
            transID = None
            startTime = time.time()
            try:
                transID = long(transDict['TransformationID'])
                if transID not in self.transInQueue:
                    break
                self.transInThread[transID] = ' [Thread%d] [%s] ' % (
                    threadID, str(transID))
                self._logInfo("Processing transformation %s." % transID,
                              transID=transID)
                startTime = time.time()
                res = self.processTransformation(transDict, clients)
                if not res['OK']:
                    self._logInfo("Failed to process transformation:",
                                  res['Message'],
                                  transID=transID)
            except Exception as x:
                self._logException('%s' % x, transID=transID)
            finally:
Example #3
0
class TransformationAgent( AgentModule, TransformationAgentsUtilities ):
  """ Usually subclass of AgentModule
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )
    TransformationAgentsUtilities.__init__( self )

    #few parameters
    self.pluginLocation = self.am_getOption( 'PluginLocation',
                                             'DIRAC.TransformationSystem.Agent.TransformationPlugin' )
    self.transformationStatus = self.am_getOption( 'transformationStatus', ['Active', 'Completing', 'Flush'] )
    self.maxFiles = self.am_getOption( 'MaxFiles', 5000 )

    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sortList( agentTSTypes )
    else:
      dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
      dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
      self.transformationTypes = sortList( dataProc + dataManip )

    #clients
    self.transfClient = TransformationClient()

    #for the threading
    self.transQueue = Queue.Queue()
    self.transInQueue = []

    #for caching using a pickle file
    self.workDirectory = self.am_getWorkDirectory()
    self.cacheFile = os.path.join( self.workDirectory, 'ReplicaCache.pkl' )
    self.dateWriteCache = datetime.datetime.utcnow()

    # Validity of the cache
    self.replicaCache = None
    self.replicaCacheValidity = self.am_getOption( 'ReplicaCacheValidity', 2 )
    self.writingCache = False

    self.noUnusedDelay = self.am_getOption( 'NoUnusedDelay', 6 )
    self.unusedFiles = {}
    self.unusedTimeStamp = {}

    self.debug = False
    self.transInThread = {}

  def initialize( self ):
    """ standard initialize
    """

    self.__readCache()
    self.dateWriteCache = datetime.datetime.utcnow()

    self.am_setOption( 'shifterProxy', 'ProductionManager' )

    # Get it threaded
    maxNumberOfThreads = self.am_getOption( 'maxThreadsInPool', 1 )
    threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads )
    self.log.info( "Multithreaded with %d threads" % maxNumberOfThreads )

    for i in xrange( maxNumberOfThreads ):
      threadPool.generateJobAndQueueIt( self._execute, [i] )

    self.log.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) )

    return S_OK()

  def finalize( self ):
    """ graceful finalization
    """
    if self.transInQueue:
      self._logInfo( "Wait for threads to get empty before terminating the agent (%d tasks)" % len( self.transInThread ) )
      self.transInQueue = []
      while self.transInThread:
        time.sleep( 2 )
      self.log.info( "Threads are empty, terminating the agent..." )
    self.__writeCache( force = True )
    return S_OK()

  def execute( self ):
    """ Just puts transformations in the queue
    """
    # Get the transformations to process
    res = self.getTransformations()
    if not res['OK']:
      self._logError( "Failed to obtain transformations: %s" % ( res['Message'] ) )
      return S_OK()
    # Process the transformations
    count = 0
    for transDict in res['Value']:
      transID = long( transDict['TransformationID'] )
      if transDict.get( 'InheritedFrom' ):
        # Try and move datasets from the ancestor production
        res = self.transfClient.moveFilesToDerivedTransformation( transDict )
        if not res['OK']:
          self._logError( "Error moving files from an inherited transformation", res['Message'], transID = transID )
        else:
          parentProd, movedFiles = res['Value']
          if movedFiles:
            self._logInfo( "Successfully moved files from %d to %d:" % ( parentProd, transID ), transID = transID )
            for status, val in movedFiles.items():
              self._logInfo( "\t%d files to status %s" % ( val, status ), transID = transID )
      if transID not in self.transInQueue:
        count += 1
        self.transInQueue.append( transID )
        self.transQueue.put( transDict )
    self._logInfo( "Out of %d transformations, %d put in thread queue" % ( len( res['Value'] ), count ) )
    return S_OK()

  def getTransformations( self ):
    """ Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
        only real thing in the execute()
    """
    transName = self.am_getOption( 'Transformation', 'All' )
    if transName == 'All':
      self._logInfo( "Initializing general purpose agent.", method = 'getTransformations' )
      transfDict = {'Status': self.transformationStatus }
      if self.transformationTypes:
        transfDict['Type'] = self.transformationTypes
      res = self.transfClient.getTransformations( transfDict, extraParams = True )
      if not res['OK']:
        self._logError( "Failed to get transformations: %s" % res['Message'], method = 'getTransformations' )
        return res
      transformations = res['Value']
      self._logInfo( "Obtained %d transformations to process" % len( transformations ), method = 'getTransformations' )
    else:
      self._logInfo( "Initializing for transformation %s." % transName, method = "getTransformations" )
      res = self.transfClient.getTransformation( transName, extraParams = True )
      if not res['OK']:
        self._logError( "Failed to get transformation: %s." % res['Message'], method = 'getTransformations' )
        return res
      transformations = [res['Value']]
    return S_OK( transformations )

  def _getClients( self ):
    """ returns the clients used in the threads
    """
    threadTransformationClient = TransformationClient()
    threadReplicaManager = ReplicaManager()

    return {'TransformationClient': threadTransformationClient,
            'ReplicaManager': threadReplicaManager}

  def _execute( self, threadID ):
    """ thread - does the real job: processing the transformations to be processed
    """

    #Each thread will have its own clients
    clients = self._getClients()

    while True:
      transDict = self.transQueue.get()
      transID = None
      startTime = time.time()
      try:
        transID = long( transDict['TransformationID'] )
        if transID not in self.transInQueue:
          break
        self.transInThread[transID] = ' [Thread%d] [%s] ' % ( threadID, str( transID ) )
        self._logInfo( "Processing transformation %s." % transID, transID = transID )
        startTime = time.time()
        res = self.processTransformation( transDict, clients )
        if not res['OK']:
          self._logInfo( "Failed to process transformation: %s" % res['Message'], transID = transID )
      except Exception as x:
        self._logException( '%s' % x, transID = transID )
      finally:
Example #4
0
class TransformationAgent(AgentModule, TransformationAgentsUtilities):
  """ Usually subclass of AgentModule
  """

  def __init__(self, *args, **kwargs):
    """ c'tor
    """
    AgentModule.__init__(self, *args, **kwargs)
    TransformationAgentsUtilities.__init__(self)

    # few parameters
    self.pluginLocation = ''
    self.transformationStatus = []
    self.maxFiles = 0
    self.transformationTypes = []

    # clients (out of the threads)
    self.transfClient = None

    # parameters for the threading
    self.transQueue = Queue.Queue()
    self.transInQueue = []

    # parameters for caching
    self.workDirectory = ''
    self.cacheFile = ''
    self.controlDirectory = ''

    self.lastFileOffset = {}
    # Validity of the cache
    self.replicaCache = None
    self.replicaCacheValidity = None
    self.writingCache = False
    self.removedFromCache = 0

    self.noUnusedDelay = 0
    self.unusedFiles = {}
    self.unusedTimeStamp = {}

    self.debug = False
    self.transInThread = {}
    self.pluginTimeout = {}

  def initialize(self):
    """ standard initialize
    """
    # few parameters
    self.pluginLocation = self.am_getOption('PluginLocation',
                                            'DIRAC.TransformationSystem.Agent.TransformationPlugin')
    self.transformationStatus = self.am_getOption('transformationStatus', ['Active', 'Completing', 'Flush'])
    # Prepare to change the name of the CS option as MaxFiles is ambiguous
    self.maxFiles = self.am_getOption('MaxFilesToProcess', self.am_getOption('MaxFiles', 5000))

    agentTSTypes = self.am_getOption('TransformationTypes', [])
    if agentTSTypes:
      self.transformationTypes = sorted(agentTSTypes)
    else:
      dataProc = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
      dataManip = Operations().getValue('Transformations/DataManipulation', ['Replication', 'Removal'])
      self.transformationTypes = sorted(dataProc + dataManip)

    # clients
    self.transfClient = TransformationClient()

    # for caching using a pickle file
    self.workDirectory = self.am_getWorkDirectory()
    self.cacheFile = os.path.join(self.workDirectory, 'ReplicaCache.pkl')
    self.controlDirectory = self.am_getControlDirectory()

    # remember the offset if any in TS
    self.lastFileOffset = {}

    # Validity of the cache
    self.replicaCache = {}
    self.replicaCacheValidity = self.am_getOption('ReplicaCacheValidity', 2)

    self.noUnusedDelay = self.am_getOption('NoUnusedDelay', 6)

    # Get it threaded
    maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
    self.log.info("Multithreaded with %d threads" % maxNumberOfThreads)

    for i in xrange(maxNumberOfThreads):
      threadPool.generateJobAndQueueIt(self._execute, [i])

    self.log.info("Will treat the following transformation types: %s" % str(self.transformationTypes))

    return S_OK()

  def finalize(self):
    """ graceful finalization
    """
    method = 'finalize'
    if self.transInQueue:
      self.transInQueue = []
      self._logInfo("Wait for threads to get empty before terminating the agent (%d tasks)" %
                    len(self.transInThread), method=method)
      self._logInfo('Remaining transformations:',
                    ','.join(str(transID) for transID in self.transInThread), method=method)
      while self.transInThread:
        time.sleep(2)
      self._logInfo("Threads are empty, terminating the agent...", method=method)
    self.__writeCache()
    return S_OK()

  def execute(self):
    """ Just puts transformations in the queue
    """
    # Get the transformations to process
    res = self.getTransformations()
    if not res['OK']:
      self._logError("Failed to obtain transformations:", res['Message'])
      return S_OK()
    # Process the transformations
    count = 0
    for transDict in res['Value']:
      transID = long(transDict['TransformationID'])
      if transDict.get('InheritedFrom'):
        # Try and move datasets from the ancestor production
        res = self.transfClient.moveFilesToDerivedTransformation(transDict)
        if not res['OK']:
          self._logError("Error moving files from an inherited transformation", res['Message'], transID=transID)
        else:
          parentProd, movedFiles = res['Value']
          if movedFiles:
            self._logInfo("Successfully moved files from %d to %d:" % (parentProd, transID), transID=transID)
            for status, val in movedFiles.iteritems():
              self._logInfo("\t%d files to status %s" % (val, status), transID=transID)
      if transID not in self.transInQueue:
        count += 1
        self.transInQueue.append(transID)
        self.transQueue.put(transDict)
    self._logInfo("Out of %d transformations, %d put in thread queue" % (len(res['Value']), count))
    return S_OK()

  def getTransformations(self):
    """ Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
        only real thing in the execute()
    """
    transName = self.am_getOption('Transformation', 'All')
    method = 'getTransformations'
    if transName == 'All':
      self._logInfo("Getting all transformations%s, status %s." %
                    (' of type %s' % str(self.transformationTypes) if self.transformationTypes else '',
                     str(self.transformationStatus)),
                    method=method)
      transfDict = {'Status': self.transformationStatus}
      if self.transformationTypes:
        transfDict['Type'] = self.transformationTypes
      res = self.transfClient.getTransformations(transfDict, extraParams=True)
      if not res['OK']:
        return res
      transformations = res['Value']
      self._logInfo("Obtained %d transformations to process" % len(transformations), method=method)
    else:
      self._logInfo("Getting transformation %s." % transName, method=method)
      res = self.transfClient.getTransformation(transName, extraParams=True)
      if not res['OK']:
        self._logError("Failed to get transformation:", res['Message'], method=method)
        return res
      transformations = [res['Value']]
    return S_OK(transformations)

  def _getClients(self):
    """ returns the clients used in the threads
    """
    threadTransformationClient = TransformationClient()
    threadDataManager = DataManager()

    return {'TransformationClient': threadTransformationClient,
            'DataManager': threadDataManager}

  def _execute(self, threadID):
    """ thread - does the real job: processing the transformations to be processed
    """

    # Each thread will have its own clients
    clients = self._getClients()

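    # Consumer loop: block on the queue; a transID no longer in transInQueue
    # (e.g. after finalize() emptied it) acts as a poison pill and stops the thread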
    while True:
      transDict = self.transQueue.get()
      # Define these before the try block so the finally clause can always refer to them,
      # even if the loop breaks or an exception is raised before they are (re)assigned
      transID = None
      startTime = time.time()
      try:
        transID = long(transDict['TransformationID'])
        if transID not in self.transInQueue:
          break
        self.transInThread[transID] = ' [Thread%d] [%s] ' % (threadID, str(transID))
        self._logInfo("Processing transformation %s." % transID, transID=transID)
        startTime = time.time()
        res = self.processTransformation(transDict, clients)
        if not res['OK']:
          self._logError("Failed to process transformation:", res['Message'], transID=transID)
      except Exception as x:  # pylint: disable=broad-except
        self._logException('Exception in plugin', lException=x, transID=transID)
      finally:
        if not transID:
          transID = 'None'
        self._logInfo("Processed transformation in %.1f seconds" % (time.time() - startTime), transID=transID)
        if transID in self.transInQueue:
          self.transInQueue.remove(transID)
        self.transInThread.pop(transID, None)
        self._logVerbose("%d transformations still in queue" % len(self.transInQueue))
    return S_OK()

  def processTransformation(self, transDict, clients):
    """ process a single transformation (in transDict)
    """
    method = 'processTransformation'
    transID = transDict['TransformationID']
    forJobs = transDict['Type'].lower() not in ('replication', 'removal')

    # First get the LFNs associated to the transformation
    transFiles = self._getTransformationFiles(transDict, clients, replicateOrRemove=not forJobs)
    if not transFiles['OK']:
      return transFiles
    if not transFiles['Value']:
      return S_OK()

    if transID not in self.replicaCache:
      self.__readCache(transID)
    transFiles = transFiles['Value']
    unusedLfns = [f['LFN'] for f in transFiles]
    unusedFiles = len(unusedLfns)

    plugin = transDict.get('Plugin', 'Standard')
    # Limit the number of LFNs to be considered for replication or removal as they are treated individually
    if not forJobs:
      # Plugin-specific limit on the number of files (0 means no limit)
      maxFiles = Operations().getValue('TransformationPlugins/%s/MaxFilesToProcess' % plugin, 0)
      totLfns = len(unusedLfns)
      lfnsToProcess = self.__applyReduction(unusedLfns, maxFiles=maxFiles)
      if len(lfnsToProcess) != totLfns:
        self._logInfo("Reduced number of files from %d to %d" % (totLfns, len(lfnsToProcess)),
                      method=method, transID=transID)
        transFiles = [f for f in transFiles if f['LFN'] in lfnsToProcess]
    else:
      lfnsToProcess = unusedLfns

    # Check the data is available with replicas
    res = self.__getDataReplicas(transDict, lfnsToProcess, clients, forJobs=forJobs)
    if not res['OK']:
      self._logError("Failed to get data replicas:", res['Message'],
                     method=method, transID=transID)
      return res
    dataReplicas = res['Value']

    # Get the plug-in type and create the plug-in object
    self._logInfo("Processing transformation with '%s' plug-in." % plugin,
                  method=method, transID=transID)
    res = self.__generatePluginObject(plugin, clients)
    if not res['OK']:
      return res
    oPlugin = res['Value']

    # Get the plug-in and set the required params
    oPlugin.setParameters(transDict)
    oPlugin.setInputData(dataReplicas)
    oPlugin.setTransformationFiles(transFiles)
    res = oPlugin.run()
    if not res['OK']:
      self._logError("Failed to generate tasks for transformation:", res['Message'],
                     method=method, transID=transID)
      return res
    tasks = res['Value']
    self.pluginTimeout[transID] = res.get('Timeout', False)
    # Create the tasks
    allCreated = True
    created = 0
    lfnsInTasks = []
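    # Each task returned by the plugin is a (targetSE, LFNs) tuple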
    for se, lfns in tasks:
      res = clients['TransformationClient'].addTaskForTransformation(transID, lfns, se)
      if not res['OK']:
        self._logError("Failed to add task generated by plug-in:", res['Message'],
                       method=method, transID=transID)
        allCreated = False
      else:
        created += 1
        lfnsInTasks += [lfn for lfn in lfns if lfn in lfnsToProcess]
    if created:
      self._logInfo("Successfully created %d tasks for transformation." % created,
                    method=method, transID=transID)
    else:
      self._logInfo("No new tasks created for transformation.",
                    method=method, transID=transID)
    self.unusedFiles[transID] = unusedFiles - len(lfnsInTasks)
    # If not all files were obtained, move the offset
    lastOffset = self.lastFileOffset.get(transID)
    if lastOffset:
      self.lastFileOffset[transID] = max(0, lastOffset - len(lfnsInTasks))
    self.__removeFilesFromCache(transID, lfnsInTasks)

    # If this transformation is in Flush status, set it back to Active once all tasks were created
    if transDict['Status'] == 'Flush' and allCreated:
      res = clients['TransformationClient'].setTransformationParameter(transID, 'Status', 'Active')
      if not res['OK']:
        self._logError("Failed to update transformation status to 'Active':", res['Message'],
                       method=method, transID=transID)
      else:
        self._logInfo("Updated transformation status to 'Active'.",
                      method=method, transID=transID)
    return S_OK()

  ######################################################################
  #
  # Internal methods used by the agent
  #

  def _getTransformationFiles(self, transDict, clients, statusList=None, replicateOrRemove=False):
    """ get the data replicas for a certain transID
    """
    # By default, DM (replication/removal) transformations are not skipped when there are no new Unused files
    skipIfNoNewUnused = not replicateOrRemove
    transID = transDict['TransformationID']
    plugin = transDict.get('Plugin', 'Standard')
    # Check if files should be sorted and limited in number
    operations = Operations()
    sortedBy = operations.getValue('TransformationPlugins/%s/SortedBy' % plugin, None)
    maxFiles = operations.getValue('TransformationPlugins/%s/MaxFilesToProcess' % plugin, 0)
    # If the NoUnusedDelay option is explicitly set for the plugin, take it into account and skip if no new Unused files
    if operations.getValue('TransformationPlugins/%s/NoUnusedDelay' % plugin, 0):
      skipIfNoNewUnused = True
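    # If the plugin timed out in the previous round, process again without delay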
    noUnusedDelay = 0 if self.pluginTimeout.get(transID, False) else \
        operations.getValue('TransformationPlugins/%s/NoUnusedDelay' % plugin, self.noUnusedDelay)
    method = '_getTransformationFiles'
    lastOffset = self.lastFileOffset.setdefault(transID, 0)

    # Files that were problematic (either explicitly, or because their SE was banned) may be recovered;
    # files missing from the catalog are always reconsidered for Removal transformations
    if not statusList:
      statusList = ['Unused', 'ProbInFC']
    statusList += ['MissingInFC'] if transDict['Type'] == 'Removal' else []
    transClient = clients['TransformationClient']
    res = transClient.getTransformationFiles(condDict={'TransformationID': transID,
                                                       'Status': statusList},
                                             orderAttribute=sortedBy,
                                             offset=lastOffset, maxfiles=maxFiles)
    if not res['OK']:
      self._logError("Failed to obtain input data:", res['Message'],
                     method=method, transID=transID)
      return res
    transFiles = res['Value']
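    # If exactly maxFiles were returned, more files are probably pending: advance the offset
    # for the next round; otherwise reset it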
    if maxFiles and len(transFiles) == maxFiles:
      self.lastFileOffset[transID] += maxFiles
    else:
      del self.lastFileOffset[transID]

    if not transFiles:
      self._logInfo("No '%s' files found for transformation." % ','.join(statusList),
                    method=method, transID=transID)
      if transDict['Status'] == 'Flush':
        res = transClient.setTransformationParameter(transID, 'Status', 'Active')
        if not res['OK']:
          self._logError("Failed to update transformation status to 'Active':", res['Message'],
                         method=method, transID=transID)
        else:
          self._logInfo("Updated transformation status to 'Active'.",
                        method=method, transID=transID)
      return S_OK()
    # Check if transformation is kicked
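    # (an operator can force processing by creating a 'KickTransformation_<transID>' file
    # in the control directory; the file is removed once seen)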
    kickFile = os.path.join(self.controlDirectory, 'KickTransformation_%s' % str(transID))
    try:
      kickTrans = os.path.exists(kickFile)
      if kickTrans:
        os.remove(kickFile)
    except OSError:
      pass

    # Check if something new happened
    now = datetime.datetime.utcnow()
    if not kickTrans and skipIfNoNewUnused and noUnusedDelay:
      nextStamp = self.unusedTimeStamp.setdefault(transID, now) + datetime.timedelta(hours=noUnusedDelay)
      skip = now < nextStamp
      if len(transFiles) == self.unusedFiles.get(transID, 0) and transDict['Status'] != 'Flush' and skip:
        self._logInfo("No new '%s' files found for transformation." % ','.join(statusList),
                      method=method, transID=transID)
        return S_OK()

    self.unusedTimeStamp[transID] = now
    # If files are not Unused, set them Unused
    notUnused = [trFile['LFN'] for trFile in transFiles if trFile['Status'] != 'Unused']
    otherStatuses = sorted(set([trFile['Status'] for trFile in transFiles]) - set(['Unused']))
    if notUnused:
      res = transClient.setFileStatusForTransformation(transID, 'Unused', notUnused, force=True)
      if not res['OK']:
        self._logError("Error setting %d files Unused:" % len(notUnused), res['Message'],
                       method=method, transID=transID)
      else:
        self._logInfo("Set %d files from %s to Unused" % (len(notUnused), ','.join(otherStatuses)))
        self.__removeFilesFromCache(transID, notUnused)
    return S_OK(transFiles)

  def __applyReduction(self, lfns, maxFiles=None):
    """ eventually remove the number of files to be considered
    """
    if maxFiles is None:
      maxFiles = self.maxFiles
    if not maxFiles or len(lfns) <= maxFiles:
      return lfns
    return randomize(lfns)[:maxFiles]

  def __getDataReplicas(self, transDict, lfns, clients, forJobs=True):
    """ Get the replicas for the LFNs and check their statuses. It first looks within the cache.
    """
    method = '__getDataReplicas'
    transID = transDict['TransformationID']
    if 'RemoveFile' in transDict['Body']:
      # When removing files, we don't care about their replicas
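      # Note: dict.fromkeys() gives every LFN a reference to the same ['None'] list object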
      return S_OK(dict.fromkeys(lfns, ['None']))
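    # An operator can request a cache refresh by creating a 'ClearCache_<transID>' file
    # in the control directory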
    clearCacheFile = os.path.join(self.controlDirectory, 'ClearCache_%s' % str(transID))
    try:
      clearCache = os.path.exists(clearCacheFile)
      if clearCache:
        os.remove(clearCacheFile)
    except OSError:
      pass
    if clearCache or transDict['Status'] == 'Flush':
      self._logInfo("Replica cache cleared", method=method, transID=transID)
      # We may need to get new replicas
      self.__clearCacheForTrans(transID)
    else:
      # If the cache needs to be cleaned
      self.__cleanCache(transID)
    startTime = time.time()
    dataReplicas = {}
    nLfns = len(lfns)
    self._logVerbose("Getting replicas for %d files" % nLfns, method=method, transID=transID)
    cachedReplicaSets = self.replicaCache.get(transID, {})
    cachedReplicas = {}
    # Merge all sets of replicas
    for replicas in cachedReplicaSets.itervalues():
      cachedReplicas.update(replicas)
    self._logInfo("Number of cached replicas: %d" % len(cachedReplicas), method=method, transID=transID)
    setCached = set(cachedReplicas)
    setLfns = set(lfns)
    for lfn in setLfns & setCached:
      dataReplicas[lfn] = cachedReplicas[lfn]
    newLFNs = setLfns - setCached
    self._logInfo("ReplicaCache hit for %d out of %d LFNs" % (len(dataReplicas), nLfns),
                  method=method, transID=transID)
    if newLFNs:
      startTime = time.time()
      self._logInfo("Getting replicas for %d files from catalog" % len(newLFNs),
                    method=method, transID=transID)
      newReplicas = {}
      for chunk in breakListIntoChunks(newLFNs, 10000):
        res = self._getDataReplicasDM(transID, chunk, clients, forJobs=forJobs)
        if res['OK']:
          reps = dict((lfn, ses) for lfn, ses in res['Value'].iteritems() if ses)
          newReplicas.update(reps)
          self.__updateCache(transID, reps)
        else:
          self._logWarn("Failed to get replicas for %d files" % len(chunk), res['Message'],
                        method=method, transID=transID)

      self._logInfo("Obtained %d replicas from catalog in %.1f seconds"
                    % (len(newReplicas), time.time() - startTime),
                    method=method, transID=transID)
      dataReplicas.update(newReplicas)
      noReplicas = newLFNs - set(dataReplicas)
      self.__writeCache(transID)
      if noReplicas:
        self._logWarn("Found %d files without replicas (or only in Failover)" % len(noReplicas),
                      method=method, transID=transID)
    return S_OK(dataReplicas)

  def _getDataReplicasDM(self, transID, lfns, clients, forJobs=True, ignoreMissing=False):
    """ Get the replicas for the LFNs and check their statuses, using the replica manager
    """
    method = '_getDataReplicasDM'

    startTime = time.time()
    self._logVerbose("Getting replicas%s from catalog for %d files" % (' for jobs' if forJobs else '', len(lfns)),
                     method=method, transID=transID)
    if forJobs:
      # Get only replicas eligible for jobs
      res = clients['DataManager'].getReplicasForJobs(lfns, getUrl=False)
    else:
      # Get all replicas
      res = clients['DataManager'].getReplicas(lfns, getUrl=False)
    if not res['OK']:
      return res
    replicas = res['Value']
    # Prepare a dictionary for all LFNs
    dataReplicas = {}
    self._logVerbose("Replica results for %d files obtained in %.2f seconds" %
                     (len(lfns), time.time() - startTime),
                     method=method, transID=transID)
    # If files are neither Successful nor Failed, they are set problematic in the FC
    problematicLfns = [lfn for lfn in lfns if lfn not in replicas['Successful'] and lfn not in replicas['Failed']]
    if problematicLfns:
      self._logInfo("%d files found problematic in the catalog, set ProbInFC" % len(problematicLfns))
      res = clients['TransformationClient'].setFileStatusForTransformation(transID, 'ProbInFC', problematicLfns)
      if not res['OK']:
        self._logError("Failed to update status of problematic files:", res['Message'],
                       method=method, transID=transID)
    # Create a dictionary containing all the file replicas
    failoverLfns = []
    for lfn, replicaDict in replicas['Successful'].iteritems():
      for se in replicaDict:
        # This remains here for backward compatibility, in case VOs have not defined
        # which SEs should not be used for jobs
        if forJobs and 'failover' in se.lower():
          self._logVerbose("Ignoring failover replica for %s." % lfn, method=method, transID=transID)
        else:
          dataReplicas.setdefault(lfn, []).append(se)
      if not dataReplicas.get(lfn):
        failoverLfns.append(lfn)
    if failoverLfns:
      self._logVerbose("%d files have no replica but possibly in Failover SE" % len(failoverLfns))
    # Make sure that file missing from the catalog are marked in the transformation DB.
    missingLfns = []
    for lfn, reason in replicas['Failed'].iteritems():
      if "No such file or directory" in reason:
        self._logVerbose("%s not found in the catalog." % lfn, method=method, transID=transID)
        missingLfns.append(lfn)
    if missingLfns:
      self._logInfo("%d files not found in the catalog" % len(missingLfns))
      if ignoreMissing:
        dataReplicas.update(dict.fromkeys(missingLfns, []))
      else:
        res = clients['TransformationClient'].setFileStatusForTransformation(transID, 'MissingInFC', missingLfns)
        if not res['OK']:
          self._logError("Failed to update status of missing files:", res['Message'],
                         method=method, transID=transID)
    return S_OK(dataReplicas)

  def __updateCache(self, transID, newReplicas):
    """ Add replicas to the cache
    """
    self.replicaCache.setdefault(transID, {})[datetime.datetime.utcnow()] = newReplicas
#    if len( newReplicas ) > 5000:
#      self.__writeCache( transID )

  def __clearCacheForTrans(self, transID):
    """ Remove all replicas for a transformation
    """
    self.replicaCache.pop(transID, None)

  def __cleanReplicas(self, transID, lfns):
    """ Remove cached replicas that are not in a list
    """
    cachedReplicas = set()
    for replicas in self.replicaCache.get(transID, {}).itervalues():
      cachedReplicas.update(replicas)
    toRemove = cachedReplicas - set(lfns)
    if toRemove:
      self._logInfo("Remove %d files from cache" % len(toRemove), method='__cleanReplicas', transID=transID)
      self.__removeFromCache(transID, toRemove)

  def __cleanCache(self, transID):
    """ Cleans the cache
    """
    try:
      if transID in self.replicaCache:
        timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=self.replicaCacheValidity)
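        # Iterate over a copy of the keys, as entries may be deleted inside the loop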
        for updateTime in set(self.replicaCache[transID]):
          nCache = len(self.replicaCache[transID][updateTime])
          if updateTime < timeLimit or not nCache:
            self._logInfo("Clear %s replicas for transformation %s, time %s" %
                          ('%d cached' % nCache if nCache else 'empty cache', str(transID), str(updateTime)),
                          transID=transID, method='__cleanCache')
            del self.replicaCache[transID][updateTime]
        # Remove empty transformations
        if not self.replicaCache[transID]:
          del self.replicaCache[transID]
    except Exception as x:
      self._logException("Exception when cleaning replica cache:", lException=x)

  def __removeFilesFromCache(self, transID, lfns):
    removed = self.__removeFromCache(transID, lfns)
    if removed:
      self._logInfo("Removed %d replicas from cache" % removed, method='__removeFilesFromCache', transID=transID)
      self.__writeCache(transID)

  def __removeFromCache(self, transID, lfns):
    if transID not in self.replicaCache:
      return 0
    removed = 0
    if self.replicaCache[transID] and lfns:
      for lfn in lfns:
        for timeKey in self.replicaCache[transID]:
          if self.replicaCache[transID][timeKey].pop(lfn, None):
            removed += 1
    return removed

  def __cacheFile(self, transID):
    return self.cacheFile.replace('.pkl', '_%s.pkl' % str(transID))

  @gSynchro
  def __readCache(self, transID):
    """ Reads from the cache
    """
    if transID in self.replicaCache:
      return
    method = '__readCache'
    # Resolve these outside the try block so the except clause can always log them
    fileName = self.__cacheFile(transID)
    try:
      if not os.path.exists(fileName):
        self.replicaCache[transID] = {}
      else:
        with open(fileName, 'r') as cacheFile:
          self.replicaCache[transID] = pickle.load(cacheFile)
        self._logInfo("Successfully loaded replica cache from file %s (%d files)" %
                      (fileName, self.__filesInCache(transID)),
                      method=method, transID=transID)
    except Exception as x:
      self._logException("Failed to load replica cache from file %s" % fileName, lException=x,
                         method=method, transID=transID)
      self.replicaCache[transID] = {}

  def __filesInCache(self, transID):
    cache = self.replicaCache.get(transID, {})
    return sum(len(lfns) for lfns in cache.itervalues())

  @gSynchro
  def __writeCache(self, transID=None):
    """ Writes the cache
    """
    method = '__writeCache'
    # Defined before the try block so the except clause below can always refer to them
    cacheFile = self.cacheFile
    t_id = transID
    try:
      startTime = time.time()
      transList = [transID] if transID else set(self.replicaCache)
      filesInCache = 0
      nCache = 0
      for t_id in transList:
        # Protect the copy of the cache
        filesInCache += self.__filesInCache(t_id)
        # write to a temporary file in order to avoid corrupted files
        cacheFile = self.__cacheFile(t_id)
        tmpFile = cacheFile + '.tmp'
        with open(tmpFile, 'w') as fd:
          pickle.dump(self.replicaCache.get(t_id, {}), fd)
        # Now rename the temporary file to its final name
        os.rename(tmpFile, cacheFile)
        nCache += 1
      self._logInfo("Successfully wrote %d replica cache file(s) (%d files) in %.1f seconds"
                    % (nCache, filesInCache, time.time() - startTime),
                    method=method, transID=transID if transID else None)
    except Exception as x:
      self._logException("Could not write replica cache file %s" % cacheFile, lException=x,
                         method=method, transID=t_id)

  def __generatePluginObject(self, plugin, clients):
    """ This simply instantiates the TransformationPlugin class with the relevant plugin name
    """
    try:
      plugModule = __import__(self.pluginLocation, globals(), locals(), ['TransformationPlugin'])
    except ImportError as e:
      self._logException("Failed to import 'TransformationPlugin' %s" % plugin, lException=e,
                         method="__generatePluginObject")
      return S_ERROR()
    try:
      plugin_o = getattr(plugModule, 'TransformationPlugin')('%s' % plugin,
                                                             transClient=clients['TransformationClient'],
                                                             dataManager=clients['DataManager'])
    except AttributeError as e:
      self._logException("Failed to create %s()" % plugin, lException=e, method="__generatePluginObject")
      return S_ERROR()
    # Configure the plugin before returning it
    plugin_o.setDirectory(self.workDirectory)
    plugin_o.setCallback(self.pluginCallback)
    return S_OK(plugin_o)

  def pluginCallback(self, transID, invalidateCache=False):
    """ Standard plugin callback
    """
    if invalidateCache:
      try:
        if transID in self.replicaCache:
          self._logInfo("Removed cached replicas for transformation", method='pluginCallback', transID=transID)
          self.replicaCache.pop(transID)
          self.__writeCache(transID)
      except Exception:
        pass