Example #1
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("limitFilesPerRSE", 200)
        self.msConfig.setdefault("skipRSEs", [])
        self.msConfig.setdefault("rseExpr", "*")
        self.msConfig.setdefault("enableRealMode", False)
        self.msConfig.setdefault("dumpRSE", False)
        self.msConfig.setdefault("gfalLogLevel", 'normal')
        self.msConfig.setdefault("dirFilterIncl", [])
        self.msConfig.setdefault("dirFilterExcl", [])
        self.msConfig.setdefault("emulateGfal2", False)
        self.msConfig.setdefault("filesToDeleteSliceSize", 100)
        if self.msConfig['emulateGfal2'] is False and gfal2 is None:
            msg = "Failed to import gfal2 library while it's not "
            msg += "set to emulate it. Crashing the service!"
            raise ImportError(msg)

        # TODO: Add 'alertManagerUrl' to msConfig
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Instantiating the Rucio Consistency Monitor Client
        self.rucioConMon = RucioConMon(self.msConfig['rucioConMon'], logger=self.logger)

        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.updateRSETimestamps, start=True, end=False),
                                                Functor(self.consRecordAge),
                                                Functor(self.getUnmergedFiles),
                                                Functor(self.filterUnmergedFiles),
                                                Functor(self.getPfn),
                                                Functor(self.cleanRSE),
                                                Functor(self.updateRSECounters, pName),
                                                Functor(self.updateRSETimestamps, start=False, end=True),
                                                Functor(self.purgeRseObj, dumpRSE=self.msConfig['dumpRSE'])])
        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}
        self.rseTimestamps = {}
        self.rseConsStats = {}
        self.protectedLFNs = []

        # The basic /store/unmerged regular expression:
        self.regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$")
        self.regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")
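
For orientation, a minimal sketch of how the pipeline built above could be driven. The module path, the configuration values and the shape of the RSE object are assumptions; only the configuration keys read in the constructor are taken from the snippet:

# Hypothetical driver; the import path and all values below are assumptions.
from WMCore.MicroService.MSUnmerged.MSUnmerged import MSUnmerged  # import path assumed

msConfig = {
    "rucioConMon": "https://cmsweb.cern.ch/rucioconmon",   # placeholder URL
    "wmstatsUrl": "https://cmsweb.cern.ch/wmstatsserver",  # placeholder URL
    "emulateGfal2": True,      # avoid the hard gfal2 import requirement
    "enableRealMode": False,   # stay in dry-run mode
}
ms = MSUnmerged(msConfig)

# Each Functor wraps a bound method plus fixed arguments; Pipeline.run() is
# assumed to call them in order, threading the RSE object through the chain.
rseObj = {"name": "T2_XX_Example"}   # hypothetical RSE object; real structure not shown
ms.plineUnmerged.run(rseObj)
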
Example #2
    def addAdditionalMonitorReport(self, config):
        """
        _addAdditionalMonitorReport_

        Collect general request information and post it to both central
        couchdb and MonIT. Items to fetch:
         * number of requests in each status (excluding archived ones)
         * for requests in assignment-approved:
           * their priority
           * their RequestNumEvents (if there is no input dataset)
         * for assignment-approved, assigned, acquired, running-* and completed:
           * number of requests in each campaign
        """
        self.logger.info("Collecting ReqMgr2 statistics...")
        wmstatsSvc = WMStatsServer(config.wmstatsSvc_url, logger=self.logger)

        results = initMetrics()

        inputCondition = {}
        outputMask = [
            "RequestStatus", "RequestType", "RequestPriority", "Campaign",
            "RequestNumEvents"
        ]
        startT = int(time.time())
        for reqInfo in wmstatsSvc.getFilteredActiveData(inputCondition, outputMask):
            status = reqInfo['RequestStatus']
            results['requestsByStatus'][status] += 1

            for campaign in _getCampaign(reqInfo["Campaign"]):
                results["requestsByStatusAndCampaign"][status].setdefault(
                    campaign, 0)
                results['requestsByStatusAndCampaign'][status][campaign] += 1

            requestPrio = reqInfo['RequestPriority']
            results["requestsByStatusAndPrio"][status].setdefault(
                requestPrio, 0)
            results['requestsByStatusAndPrio'][status][requestPrio] += 1

            results['requestsByStatusAndNumEvts'][status] += _getRequestNumEvents(reqInfo['RequestNumEvents'])

        endT = int(time.time())
        results["total_query_time"] = endT - startT

        if self.postToAMQ:
            allDocs = self.buildMonITDocs(results)
            self.uploadToAMQ(allDocs)

        return results
Example #3
    def addAdditionalMonitorReport(self, config):
        """
        _addAdditionalMonitorReport_

        Collect general request information and post it to both central
        couchdb and MonIT. Items to fetch:
         * number of requests in each status (excluding archived ones)
         * for requests in assignment-approved:
           * their priority
           * their RequestNumEvents (if there is no input dataset)
         * for assignment-approved, assigned, acquired, running-* and completed:
           * number of requests in each campaign
        """
        self.logger.info("Collecting ReqMgr2 statistics...")
        wmstatsSvc = WMStatsServer(config.wmstatsSvc_url, logger=self.logger)

        results = initMetrics()

        inputCondition = {}
        outputMask = ["RequestStatus", "RequestType", "RequestPriority",
                      "Campaign", "RequestNumEvents"]
        startT = int(time.time())
        for reqInfo in wmstatsSvc.getFilteredActiveData(inputCondition, outputMask):
            status = reqInfo['RequestStatus']
            results['requestsByStatus'][status] += 1

            for campaign in _getCampaign(reqInfo["Campaign"]):
                results["requestsByStatusAndCampaign"][status].setdefault(campaign, 0)
                results['requestsByStatusAndCampaign'][status][campaign] += 1

            requestPrio = reqInfo['RequestPriority']
            results["requestsByStatusAndPrio"][status].setdefault(requestPrio, 0)
            results['requestsByStatusAndPrio'][status][requestPrio] += 1

            results['requestsByStatusAndNumEvts'][status] += _getRequestNumEvents(reqInfo['RequestNumEvents'])

        endT = int(time.time())
        results["total_query_time"] = endT - startT

        if self.postToAMQ:
            allDocs = self.buildMonITDocs(results)
            self.uploadToAMQ(allDocs)

        return results
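
The initMetrics() helper is referenced but not shown in the two snippets above. A shape consistent with how the results dictionary is used (integer counters per status, nested per-campaign and per-priority counters) might look as follows; this is a sketch, not the real helper:

from collections import defaultdict

def initMetrics():
    # Hypothetical reconstruction, based solely on how 'results' is used above.
    return {
        "requestsByStatus": defaultdict(int),               # status -> count
        "requestsByStatusAndCampaign": defaultdict(dict),   # status -> {campaign: count}
        "requestsByStatusAndPrio": defaultdict(dict),       # status -> {priority: count}
        "requestsByStatusAndNumEvts": defaultdict(int),     # status -> summed events
        "total_query_time": 0,
    }
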
Example #4
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """

        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)
        wmstatsSvc = WMStatsServer(config.wmstats_url, logger=self.logger)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing status")
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)
        moveToArchived(wmstatsSvc, reqmgrSvc, config.archiveDelayHours,
                       self.logger)

        self.logger.info("Done advancing status")

        return
Example #5
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """
        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)
        wmstatsSvc = WMStatsServer(config.wmstats_url, logger=self.logger)
        logdb = LogDB(config.central_logdb_url, config.log_reporter)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing status")
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)
        moveToArchived(wmstatsSvc, reqmgrSvc, logdb, config.archiveDelayHours,
                       self.logger)

        self.logger.info("Done advancing status")

        return
Example #6
    def advanceStatus(self, config):
        """
        Advance the request status based on the global workqueue elements status
        """
        reqmgrSvc = ReqMgr(config.reqmgr2_url, logger=self.logger)
        gqService = WorkQueue(config.workqueue_url)
        wmstatsSvc = WMStatsServer(config.wmstats_url, logger=self.logger)

        self.logger.info("Getting GQ data for status check")
        wfStatusDict = gqService.getWorkflowStatusFromWQE()

        self.logger.info("Advancing statuses")
        if getattr(config, "enableMSStatusTransition", False):
            moveTransferorStatus(reqmgrSvc, self.logger)
        moveForwardStatus(reqmgrSvc, wfStatusDict, self.logger)
        moveToCompletedForNoWQJobs(reqmgrSvc, wfStatusDict, self.logger)
        moveToArchived(wmstatsSvc, reqmgrSvc, self.logDB,
                       config.archiveDelayHours, self.logger)

        self.logger.info("Done advancing status")

        return
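
The three advanceStatus() variants above differ mainly in their dependencies (LogDB handling and the guarded moveTransferorStatus() call). A hedged sketch of the configuration attributes the latest variant reads, using a stand-in object and placeholder values:

from types import SimpleNamespace

# Only the attribute names come from the snippets above; every value is a placeholder.
config = SimpleNamespace(
    reqmgr2_url="https://cmsweb.cern.ch/reqmgr2",               # placeholder
    workqueue_url="https://cmsweb.cern.ch/couchdb/workqueue",   # placeholder
    wmstats_url="https://cmsweb.cern.ch/wmstatsserver",         # placeholder
    archiveDelayHours=8,                                        # placeholder
    enableMSStatusTransition=True,   # opt in to the moveTransferorStatus() call
)
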
Example #7
    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        # The Tier0 agent doesn't need this
        if hasattr(self.config, "Tier0Feeder"):
            self.wmstatsServerSvc = None
        else:
            wmstatsSvcURL = self.config.General.centralWMStatsURL.replace(
                "couchdb/wmstats", "wmstatsserver")
            self.wmstatsServerSvc = WMStatsServer(wmstatsSvcURL)

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.updateBlocksDAO = daoFactory(classname="UpdateBlocks")
        self.updateFilesDAO = daoFactory(classname="UpdateFiles")
        self.createBlocksDAO = daoFactory(classname="CreateBlocks")
        self.setBlockFilesDAO = daoFactory(classname="SetBlockFiles")

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup",
                                    "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType",
                                   "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload,
                                          "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'dumpBlock', False)

        self.copyPath = os.path.join(
            getattr(self.config.DBS3Upload, 'componentDir', '/data/srv/'),
            'dbsuploader_block.json')

        self.timeoutWaiver = 1

        self.datasetParentageCache = {}

        return
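
The WMStats server endpoint above is derived from the central CouchDB URL by a plain string substitution; for example (the URL itself is a placeholder):

centralWMStatsURL = "https://cmsweb.cern.ch/couchdb/wmstats"   # placeholder value
wmstatsSvcURL = centralWMStatsURL.replace("couchdb/wmstats", "wmstatsserver")
print(wmstatsSvcURL)   # -> https://cmsweb.cern.ch/wmstatsserver
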
Example #8
class DBSUploadPoller(BaseWorkerThread):
    """
    Handles poll-based DBSUpload

    """
    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        # The Tier0 agent doesn't need this
        if hasattr(self.config, "Tier0Feeder"):
            self.wmstatsServerSvc = None
        else:
            wmstatsSvcURL = self.config.General.centralWMStatsURL.replace(
                "couchdb/wmstats", "wmstatsserver")
            self.wmstatsServerSvc = WMStatsServer(wmstatsSvcURL)

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.updateBlocksDAO = daoFactory(classname="UpdateBlocks")
        self.updateFilesDAO = daoFactory(classname="UpdateFiles")
        self.createBlocksDAO = daoFactory(classname="CreateBlocks")
        self.setBlockFilesDAO = daoFactory(classname="SetBlockFiles")

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup",
                                    "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType",
                                   "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload,
                                          "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'dumpBlock', False)

        self.copyPath = os.path.join(
            getattr(self.config.DBS3Upload, 'componentDir', '/data/srv/'),
            'dbsuploader_block.json')

        self.timeoutWaiver = 1

        self.datasetParentageCache = {}

        return

    def setupPool(self):
        """
        _setupPool_

        Set up the processing pool for work
        """
        if self.pool:
            # Then something already exists.  Continue
            return

        self.workInput = multiprocessing.Queue()
        self.workResult = multiprocessing.Queue()

        # Starting up the pool:
        for _ in range(self.nProc):
            p = multiprocessing.Process(target=uploadWorker,
                                        args=(self.workInput, self.workResult,
                                              self.dbsUrl))
            p.start()
            self.pool.append(p)

        return

    def __del__(self):
        """
        __del__

        Trigger a close of connections if necessary
        """
        self.close()
        return

    def close(self):
        """
        _close_

        Kill all connections and terminate
        """
        terminate = False
        for _ in self.pool:
            try:
                self.workInput.put('STOP')
            except Exception as ex:
                # Something very strange happens here
                # It's like it raises a blank exception
                # Upon being told to return
                msg = "Hit some exception in deletion\n"
                msg += str(ex)
                logging.debug(msg)
                terminate = True
        try:
            self.workInput.close()
            self.workResult.close()
        except Exception:
            # What are you going to do?
            pass
        for proc in self.pool:
            if terminate:
                proc.terminate()
            else:
                proc.join()
        self.pool = []
        self.workInput = None
        self.workResult = None
        return

    def terminate(self, parameters):
        """
        Do one more pass, then terminate

        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(parameters)

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        First, check blocks that may be already uploaded
        Then, load blocks
        Then, load files
        Then, move files into blocks
        Then add new blocks in DBSBuffer
        Then add blocks to DBS
        Then mark blocks as done in DBSBuffer
        """
        logging.info("Starting the DBSUpload Polling Cycle")
        # refreshing parentageCache every cycle
        if self.updateDatasetParentageCache() is False:
            return

        logging.debug("Dataset parentage map: %s", self.datasetParentageCache)
        try:
            self.checkBlocks()
            self.loadBlocks()
            self.loadFiles()
            self.checkBlockCompletion()
            self.inputBlocks()
            self.retrieveBlocks()
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled Exception in DBSUploadPoller! Error: %s" % str(ex)
            logging.exception(msg)
            raise DBSUploadException(msg)

    def updateDatasetParentageCache(self):
        """
        Return True to indicate it successfully fetched the parentage
        map. If there was an exception, return False
        """
        myThread = threading.currentThread()

        success = True
        if not self.wmstatsServerSvc:
            self.datasetParentageCache = {}
            return success

        try:
            self.datasetParentageCache = self.wmstatsServerSvc.getChildParentDatasetMap()
        except Exception as ex:
            excReason = getattr(ex, 'reason', '')
            errorMsg = 'Failed to fetch parentage map from WMStats, skipping this cycle. '
            errorMsg += 'Exception: {}. Reason: {}. Error: {}. '.format(
                type(ex).__name__, excReason, str(ex))
            if isPassiveError(ex):
                logging.warning(errorMsg)
            else:
                errorMsg += 'Hit a terminal exception in DBSUploadPoller.'
                raise DBSUploadException(errorMsg)
            myThread.logdbClient.post("DBS3Upload_parentMap", errorMsg,
                                      "warning")
            success = False
        else:
            myThread.logdbClient.delete("DBS3Upload_parentMap",
                                        "warning",
                                        this_thread=True)

        return success

    def loadBlocks(self):
        """
        _loadBlocks_

        Find all blocks; make sure they're in the cache
        """
        openBlocks = self.dbsUtil.findOpenBlocks()
        logging.info("Found %d open blocks.", len(openBlocks))
        logging.debug("These are the openblocks: %s", openBlocks)

        # Load them if we don't have them
        blocksToLoad = []
        for block in openBlocks:
            if block['blockname'] not in self.blockCache:
                blocksToLoad.append(block['blockname'])

        # Now load the blocks
        try:
            loadedBlocks = self.dbsUtil.loadBlocks(blocksToLoad)
            logging.info("Loaded %d blocks.", len(loadedBlocks))
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while loading blocks.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks to load: %s\n", blocksToLoad)
            raise DBSUploadException(msg)

        for blockInfo in loadedBlocks:
            block = DBSBufferBlock(name=blockInfo['block_name'],
                                   location=blockInfo['origin_site_name'],
                                   datasetpath=blockInfo['datasetpath'])

            parent = self.datasetParentageCache.get(blockInfo['datasetpath'])
            if parent:
                block.addDatasetParent(parent)
                logging.debug(
                    "Load block: Child dataset %s, Parent dataset %s",
                    blockInfo['datasetpath'], parent)
            block.FillFromDBSBuffer(blockInfo)
            blockname = block.getName()

            # Now we have to load files...
            try:
                files = self.dbsUtil.loadFilesByBlock(blockname=blockname)
                logging.info("Have %i files for block %s", len(files),
                             blockname)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading files for existing blocks.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks being loaded: %s\n", blockname)
                raise DBSUploadException(msg)

            # Add the loaded files to the block
            for f in files:
                block.addFile(f, self.datasetType, self.primaryDatasetType)

            # Add to the cache
            self.blockCache[blockInfo['block_name']] = block

        return

    def loadFiles(self):
        """
        _loadFiles_

        Load all files that need to be loaded.  I will do this by DatasetPath
        to break the monstrous calls down into smaller chunks.
        """
        dspList = self.dbsUtil.findUploadableDAS()

        readyBlocks = []
        for dspInfo in dspList:

            datasetpath = dspInfo['DatasetPath']

            # Get the files
            try:
                loadedFiles = self.dbsUtil.findUploadableFilesByDAS(
                    datasetpath=datasetpath)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading uploadable files for DatasetPath.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("DatasetPath being loaded: %s\n", datasetpath)
                raise DBSUploadException(msg)

            # Sort the files and blocks by location
            fileDict = sortListByKey(loadedFiles, 'locations')

            # Now add each file
            for location in fileDict.keys():

                files = fileDict.get(location)

                if files:
                    currentBlock = self.getBlock(files[0], location, True)
                    currentBlock.setAcquisitionEra(
                        era=dspInfo['AcquisitionEra'])
                    currentBlock.setProcessingVer(
                        procVer=dspInfo['ProcessingVer'])

                    for newFile in files:

                        if newFile.get('block', 1) is not None:
                            # Then this file already has a block
                            # It should be accounted for somewhere
                            # Or loaded with the block
                            continue

                        # Check if we can put files in this block
                        if not self.isBlockOpen(newFile=newFile,
                                                block=currentBlock):
                            # Then we have to close the block and get a new one
                            currentBlock.setPendingAndCloseBlock()
                            readyBlocks.append(currentBlock)
                            currentBlock = self.getBlock(newFile=newFile,
                                                         location=location)
                            currentBlock.setAcquisitionEra(
                                era=dspInfo['AcquisitionEra'])
                            currentBlock.setProcessingVer(
                                procVer=dspInfo['ProcessingVer'])

                        # Now deal with the file
                        currentBlock.addFile(newFile, self.datasetType,
                                             self.primaryDatasetType)
                        self.filesToUpdate.append({'filelfn': newFile['lfn'],
                                                   'block': currentBlock.getName()})
                    # Done with the location
                    readyBlocks.append(currentBlock)

        for block in readyBlocks:
            self.blockCache[block.getName()] = block

        return

    def checkBlockCompletion(self):
        """
        _checkBlockCompletion_

        Mark Open blocks as Pending if they have timed out or their workflows have completed
        """
        completedWorkflows = self.dbsUtil.getCompletedWorkflows()
        for block in self.blockCache.values():
            if block.status == "Open":
                if (block.getTime() > block.getMaxBlockTime()) or any(
                        key in completedWorkflows for key in block.workflows):
                    block.setPendingAndCloseBlock()

        return

    def isBlockOpen(self, newFile, block, doTime=False):
        """
        _isBlockOpen_

        Check and see if a block is full
        This will check on time, but that's disabled by default
        The plan is to do a time check after we do everything else,
        so open blocks about to time out can still get more
        files put in them.
        """

        if block.getMaxBlockFiles() is None or block.getMaxBlockNumEvents() is None or \
                block.getMaxBlockSize() is None or block.getMaxBlockTime() is None:
            return True
        if block.status != 'Open':
            # Then somebody has dumped this already
            return False
        if block.getSize() + newFile['size'] > block.getMaxBlockSize():
            return False
        if block.getNumEvents() + newFile['events'] > block.getMaxBlockNumEvents():
            return False
        if block.getNFiles() >= block.getMaxBlockFiles():
            # Then we have to dump it because this file
            # will put it over the limit.
            return False
        if block.getTime() > block.getMaxBlockTime() and doTime:
            return False

        return True

    def getBlock(self, newFile, location, skipOpenCheck=False):
        """
        _getBlock_

        Retrieve a block if one exists with a matching datasetpath/location and is open.
        If no such block is found, create and return a new one.
        """
        datasetpath = newFile["datasetPath"]

        for block in self.blockCache.values():
            if datasetpath == block.getDatasetPath() and location == block.getLocation():
                if not self.isBlockOpen(newFile=newFile,
                                        block=block) and not skipOpenCheck:
                    # Block isn't open anymore.  Mark it as pending so that it gets uploaded.
                    block.setPendingAndCloseBlock()
                else:
                    return block

        # A suitable open block does not exist.  Create a new one.
        blockname = "%s#%s" % (datasetpath, makeUUID())
        newBlock = DBSBufferBlock(name=blockname,
                                  location=location,
                                  datasetpath=datasetpath)

        parent = self.datasetParentageCache.get(datasetpath)
        if parent:
            newBlock.addDatasetParent(parent)
            logging.debug("Get block: Child dataset %s, Parent dataset %s",
                          datasetpath, parent)

        self.blockCache[blockname] = newBlock
        return newBlock

    def inputBlocks(self):
        """
        _inputBlocks_

        Loop through all of the "active" blocks and sort them so we can act
        appropriately on them.  Everything will be sorted based on the
        following:
         Queued - Block is already being acted on by another process.  We just
          ignore it.
         Pending, not in DBSBuffer - Block that has been closed and needs to
           be injected into DBS and also written to DBSBuffer.  We'll do both.
         Pending, in DBSBuffer - Block has been closed and written to
           DBSBuffer.  We just need to inject it into DBS.
         Open, not in DBSBuffer - Newly created block that needs to be written
           to DBSBuffer.
         Open, in DBSBuffer - Newly created block that has already been
           written to DBSBuffer.  We don't have to do anything with it.
        """
        if not self.blockCache:
            return

        myThread = threading.currentThread()

        createInDBS = []
        createInDBSBuffer = []
        updateInDBSBuffer = []
        for block in self.blockCache.values():
            if block.getName() in self.queuedBlocks:
                # Block is already being dealt with by another process.  We'll
                # ignore it here.
                continue
            if block.status == 'Pending':
                # All pending blocks need to be injected into DBS.
                createInDBS.append(block)

                # If this is a new block it needs to be added to DBSBuffer
                # otherwise it just needs to be updated in DBSBuffer.
                if not block.inBuff:
                    createInDBSBuffer.append(block)
                else:
                    updateInDBSBuffer.append(block)
            if block.status == 'Open' and not block.inBuff:
                # New block that needs to be added to DBSBuffer.
                createInDBSBuffer.append(block)

        # First handle new and updated blocks
        if createInDBSBuffer or updateInDBSBuffer:
            try:
                myThread.transaction.begin()
                if createInDBSBuffer:
                    self.createBlocksDAO.execute(
                        blocks=createInDBSBuffer,
                        conn=myThread.transaction.conn,
                        transaction=True)
                if updateInDBSBuffer:
                    self.updateBlocksDAO.execute(
                        blocks=updateInDBSBuffer,
                        conn=myThread.transaction.conn,
                        transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while writing new blocks into DBSBuffer\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks for DBSBuffer: %s\n", createInDBSBuffer)
                logging.debug("Blocks for Update: %s\n", updateInDBSBuffer)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        # Update block status in the block cache.  Mark the blocks that we have
        # added to DBSBuffer as being in DBSBuffer.
        for block in createInDBSBuffer:
            self.blockCache.get(block.getName()).inBuff = True

        # Record new file/block associations in DBSBuffer.
        if self.filesToUpdate:
            try:
                myThread.transaction.begin()
                self.setBlockFilesDAO.execute(binds=self.filesToUpdate,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
                self.filesToUpdate = []
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while setting blocks in files.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Files to Update: %s\n", self.filesToUpdate)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        if not createInDBS:
            # then there is nothing else to do
            return

        # Build the pool if it was closed
        if not self.pool:
            self.setupPool()

        # Finally upload blocks to DBS.
        for block in createInDBS:
            if not block.files:
                # What are we doing?
                logging.debug("Skipping empty block")
                continue
            if block.getDataset() is None:
                # Then we have to fix the dataset
                dbsFile = block.files[0]
                block.setDataset(datasetName=dbsFile['datasetPath'],
                                 primaryType=self.primaryDatasetType,
                                 datasetType=self.datasetType,
                                 physicsGroup=dbsFile.get(
                                     'physicsGroup', None),
                                 prep_id=dbsFile.get('prep_id', None))
            logging.debug("Found block %s in blocks", block.getName())
            block.setPhysicsGroup(group=self.physicsGroup)

            encodedBlock = block.convertToDBSBlock()
            logging.info("About to insert block %s", block.getName())
            self.workInput.put({
                'name': block.getName(),
                'block': encodedBlock
            })
            self.blockCount += 1
            if self.produceCopy:
                with open(self.copyPath, 'w') as jo:
                    json.dump(encodedBlock, jo, indent=2)
            self.queuedBlocks.append(block.getName())

        # And all work is in and we're done for now
        return

    def retrieveBlocks(self):
        """
        _retrieveBlocks_

        Once blocks are in DBS, we have to retrieve them and see what's
        in them.  What we do is get everything out of the result queue,
        and then update it in DBSBuffer.

        To do this, the result queue needs to pass back the blockname
        """
        myThread = threading.currentThread()

        blocksToClose = []
        emptyCount = 0
        while self.blockCount > 0:
            if emptyCount > self.nTries:

                # Only raise an error when timeoutWaiver is 0.
                # The first attempt can take a long time if many files have
                # accumulated in the buffer, but subsequent attempts should be
                # faster. timeoutWaiver is a component-level variable and is only
                # reset when the component restarts: a backlog builds up only when
                # this component has been down for a long time while the other
                # components kept feeding data into dbsbuffer.

                if self.timeoutWaiver == 0:
                    msg = "Exceeded max number of waits while waiting for DBS to finish"
                    raise DBSUploadException(msg)
                else:
                    self.timeoutWaiver = 0
                    return
            try:
                # Get stuff out of the queue with a ridiculously
                # short wait time
                blockresult = self.workResult.get(timeout=self.wait)
                blocksToClose.append(blockresult)
                self.blockCount -= 1
                logging.debug("Got a block to close")
            except queue.Empty:
                # This means the queue has no current results
                time.sleep(2)
                emptyCount += 1
                continue

        loadedBlocks = []
        for result in blocksToClose:
            # Remove from list of work being processed
            self.queuedBlocks.remove(result.get('name'))
            if result["success"] == "uploaded":
                block = self.blockCache.get(result.get('name'))
                block.status = 'InDBS'
                loadedBlocks.append(block)
            elif result["success"] == "check":
                block = result["name"]
                self.blocksToCheck.append(block)
            else:
                logging.error(
                    "Error found in multiprocess during process of block %s",
                    result.get('name'))
                logging.error(result['error'])
                # Continue to the next block
                # Block will remain in pending status until it is transferred

        if loadedBlocks:
            try:
                myThread.transaction.begin()
                self.updateFilesDAO.execute(blocks=loadedBlocks,
                                            status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
                self.updateBlocksDAO.execute(blocks=loadedBlocks,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
            except Exception as ex:
                myThread.transaction.rollback()
                # possible deadlock with PhEDExInjector, retry once after 10s
                logging.warning(
                    "Oracle exception, possible deadlock due to race condition, retry after 10s sleep"
                )
                time.sleep(10)
                try:
                    myThread.transaction.begin()
                    self.updateFilesDAO.execute(blocks=loadedBlocks,
                                                status="InDBS",
                                                conn=myThread.transaction.conn,
                                                transaction=True)
                    self.updateBlocksDAO.execute(
                        blocks=loadedBlocks,
                        conn=myThread.transaction.conn,
                        transaction=True)
                except Exception as ex:
                    myThread.transaction.rollback()
                    msg = "Unhandled exception while finished closed blocks in DBSBuffer\n"
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Blocks for Update: %s\n", loadedBlocks)
                    raise DBSUploadException(msg)
                else:
                    myThread.transaction.commit()

            else:
                myThread.transaction.commit()

        for block in loadedBlocks:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean up the pool so we don't have stuff waiting around
        if self.pool:
            self.close()

        # And we're done
        return

    def checkBlocks(self):
        """
        _checkBlocks_

        Check with DBS3 if the blocks marked as check are
        uploaded or not.
        """
        myThread = threading.currentThread()

        blocksUploaded = []

        # See if there is anything to check
        for block in self.blocksToCheck:
            logging.debug("Checking block existence: %s", block)
            # Check in DBS if the block was really inserted
            try:
                result = self.dbsApi.listBlocks(block_name=block)
                # it is an empty list if block cannot be found
                if result:
                    loadedBlock = self.blockCache.get(block)
                    loadedBlock.status = 'InDBS'
                    blocksUploaded.append(loadedBlock)
            except Exception as ex:
                msg = "Error trying to check block %s through DBS. Error: %s" % (
                    block, str(ex))
                logging.exception(msg)

        # Update the status of those blocks that were truly inserted
        if blocksUploaded:
            try:
                myThread.transaction.begin()
                self.updateBlocksDAO.execute(blocks=blocksUploaded,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
                self.updateFilesDAO.execute(blocks=blocksUploaded,
                                            status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while finished closed blocks in DBSBuffer\n"
                msg += str(ex)
                logging.exception(msg)
                logging.debug("Blocks for Update: %s\n", blocksUploaded)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        for block in blocksUploaded:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean the check list
        self.blocksToCheck = []

        # We're done
        return
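
The uploadWorker() function started in setupPool() is not part of this snippet. Judging from how retrieveBlocks() and checkBlocks() interpret the result queue, each worker reads {'name', 'block'} items from the input queue until it sees the 'STOP' sentinel and puts back a dict whose 'success' field is 'uploaded', 'check', or an error marker. A heavily simplified sketch of that contract, not the real implementation:

from dbs.apis.dbsClient import DbsApi   # same DBS client as used above

def uploadWorker(workInput, workResult, dbsUrl):
    # Hypothetical sketch of the worker contract assumed by DBSUploadPoller;
    # the real uploadWorker (error classification, retries, etc.) is not shown here.
    dbsApi = DbsApi(url=dbsUrl)
    while True:
        work = workInput.get()
        if work == 'STOP':            # sentinel sent by close()
            break
        name = work['name']
        try:
            dbsApi.insertBulkBlock(blockDump=work['block'])
            workResult.put({'name': name, 'success': 'uploaded'})
        except Exception as ex:
            # 'check' would be returned when the block may already exist in DBS;
            # the classification logic here is an assumption.
            workResult.put({'name': name, 'success': 'error', 'error': str(ex)})
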
Example #9
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSRuleCleaner module
        :param msConfig: micro service configuration
        """
        super(MSRuleCleaner, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("services", ['ruleCleaner'])
        self.msConfig.setdefault("rucioWmaAccount", "wma_test")
        self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor")
        self.msConfig.setdefault('enableRealMode', False)

        self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode"
        self.curlMgr = RequestHandler()
        self.targetStatusRegex = re.compile(r'.*archived')
        self.logDB = LogDB(self.msConfig["logDBUrl"],
                           self.msConfig["logDBReporter"],
                           logger=self.logger)
        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineMSTrCont'
        self.plineMSTrCont = Pipeline(name=pName,
                                      funcLine=[Functor(self.setPlineMarker, pName),
                                                Functor(self.setParentDatasets),
                                                Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']),
                                                Functor(self.cleanRucioRules)])
        pName = 'plineMSTrBlock'
        self.plineMSTrBlock = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.setParentDatasets),
                                                 Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentCont'
        self.plineAgentCont = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentBlock'
        self.plineAgentBlock = Pipeline(name=pName,
                                        funcLine=[Functor(self.setPlineMarker, pName),
                                                  Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']),
                                                  Functor(self.cleanRucioRules)])
        pName = 'plineArchive'
        self.plineArchive = Pipeline(name=pName,
                                     funcLine=[Functor(self.setPlineMarker, pName),
                                               Functor(self.findTargetStatus),
                                               Functor(self.setClean),
                                               Functor(self.setArchivalDelayExpired),
                                               Functor(self.setLogDBClean),
                                               Functor(self.archive)])

        # Building the different set of plines we will need later:
        # NOTE: The following are all the functional pipelines which are supposed to include
        #       a cleanup function and report cleanup status in the MSRuleCleanerWflow object
        self.cleanuplines = [self.plineMSTrCont,
                             self.plineMSTrBlock,
                             self.plineAgentCont,
                             self.plineAgentBlock]
        # Building an auxiliary list of cleanup pipeline names only:
        self.cleanupPipeNames = [pline.name for pline in self.cleanuplines]

        # Building lists of pipelines related only to Agents or MSTransferor
        self.agentlines = [self.plineAgentCont,
                           self.plineAgentBlock]
        self.mstrlines = [self.plineMSTrCont,
                          self.plineMSTrBlock]

        # Initialization of the 'cleaned' and 'archived' counters:
        self.wfCounters = {'cleaned': {},
                           'archived': {'normalArchived': 0,
                                        'forceArchived': 0}}
        self.globalLocks = set()
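
For reference, a hedged sketch of the msConfig dictionary this constructor consumes; only keys that are read in the snippet appear here, and every value is a placeholder:

# Hypothetical configuration; key names come from the snippet above, values are placeholders.
msConfig = {
    "logDBUrl": "https://cmsweb.cern.ch/couchdb/wmstats_logdb",  # placeholder URL
    "logDBReporter": "ms-rulecleaner",                           # placeholder reporter name
    "wmstatsUrl": "https://cmsweb.cern.ch/wmstatsserver",        # placeholder URL
    "rucioMStrAccount": "wmcore_transferor",
    "rucioWmaAccount": "wma_test",
    "enableRealMode": False,    # keep the module in DryRunMode
}
cleaner = MSRuleCleaner(msConfig)
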
Example #10
class MSRuleCleaner(MSCore):
    """
    The MSRuleCleaner class provides the logic used to clean the Rucio
    block level data placement rules created by WMAgent.
    """

    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSRuleCleaner module
        :param msConfig: micro service configuration
        """
        super(MSRuleCleaner, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("services", ['ruleCleaner'])
        self.msConfig.setdefault("rucioWmaAccount", "wma_test")
        self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor")
        self.msConfig.setdefault('enableRealMode', False)

        self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode"
        self.curlMgr = RequestHandler()
        self.targetStatusRegex = re.compile(r'.*archived')
        self.logDB = LogDB(self.msConfig["logDBUrl"],
                           self.msConfig["logDBReporter"],
                           logger=self.logger)
        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineMSTrCont'
        self.plineMSTrCont = Pipeline(name=pName,
                                      funcLine=[Functor(self.setPlineMarker, pName),
                                                Functor(self.setParentDatasets),
                                                Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']),
                                                Functor(self.cleanRucioRules)])
        pName = 'plineMSTrBlock'
        self.plineMSTrBlock = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.setParentDatasets),
                                                 Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentCont'
        self.plineAgentCont = Pipeline(name=pName,
                                       funcLine=[Functor(self.setPlineMarker, pName),
                                                 Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']),
                                                 Functor(self.cleanRucioRules)])
        pName = 'plineAgentBlock'
        self.plineAgentBlock = Pipeline(name=pName,
                                        funcLine=[Functor(self.setPlineMarker, pName),
                                                  Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']),
                                                  Functor(self.cleanRucioRules)])
        pName = 'plineArchive'
        self.plineArchive = Pipeline(name=pName,
                                     funcLine=[Functor(self.setPlineMarker, pName),
                                               Functor(self.findTargetStatus),
                                               Functor(self.setClean),
                                               Functor(self.setArchivalDelayExpired),
                                               Functor(self.setLogDBClean),
                                               Functor(self.archive)])

        # Building the different set of plines we will need later:
        # NOTE: The following are all the functional pipelines which are supposed to include
        #       a cleanup function and report cleanup status in the MSRuleCleanerWflow object
        self.cleanuplines = [self.plineMSTrCont,
                             self.plineMSTrBlock,
                             self.plineAgentCont,
                             self.plineAgentBlock]
        # Building an auxiliary list of cleanup pipeline names only:
        self.cleanupPipeNames = [pline.name for pline in self.cleanuplines]

        # Building lists of pipelines related only to Agents or MSTransferor
        self.agentlines = [self.plineAgentCont,
                           self.plineAgentBlock]
        self.mstrlines = [self.plineMSTrCont,
                          self.plineMSTrBlock]

        # Initialization of the 'cleaned' and 'archived' counters:
        self.wfCounters = {'cleaned': {},
                           'archived': {'normalArchived': 0,
                                        'forceArchived': 0}}
        self.globalLocks = set()

    def getGlobalLocks(self):
        """
        Fetches the list of 'globalLocks' from the wmstats server and the list of
        'parentLocks' from the request manager, and stores/updates their union in
        the 'globalLocks' instance variable.
        """
        self.logger.info("Fetching globalLocks list from wmstats server.")
        try:
            globalLocks = set(self.wmstatsSvc.getGlobalLocks())
        except Exception as ex:
            msg = "Failed to refresh global locks list for the current polling cycle. Error: %s "
            msg += "Skipping this polling cycle."
            self.logger.error(msg, str(ex))
            raise ex
        self.logger.info("Fetching parentLocks list from reqmgr2 server.")
        try:
            parentLocks = set(self.reqmgr2.getParentLocks())
        except Exception as ex:
            msg = "Failed to refresh parent locks list for the current poling cycle. Error: %s "
            msg += "Skipping this polling cycle."
            self.logger.error(msg, str(ex))
            raise ex
        self.globalLocks = globalLocks | parentLocks

    def resetCounters(self):
        """
        A simple function for zeroing the cleaned and archived counters.
        """
        for pline in self.cleanuplines:
            self.wfCounters['cleaned'][pline.name] = 0
        self.wfCounters['archived']['normalArchived'] = 0
        self.wfCounters['archived']['forceArchived'] = 0

    def execute(self, reqStatus):
        """
        Executes the whole ruleCleaner logic
        :return: summary
        """
        # start threads in MSManager which should call this method
        summary = dict(RULECLEANER_REPORT)

        self.currThread = current_thread()
        self.currThreadIdent = self.currThread.name
        self.updateReportDict(summary, "thread_id", self.currThreadIdent)
        self.resetCounters()
        self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode)

        # Build the list of workflows to work on:
        try:
            requestRecords = {}
            for status in reqStatus:
                requestRecords.update(self.getRequestRecords(status))
        except Exception as err:  # general error
            msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords
        try:
            self.getGlobalLocks()
            totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords)
            msg = "\nNumber of processed workflows: %s."
            msg += "\nNumber of properly cleaned workflows: %s."
            msg += "\nNumber of normally archived workflows: %s."
            msg += "\nNumber of force archived workflows: %s."
            self.logger.info(msg,
                             totalNumRequests,
                             cleanNumRequests,
                             normalArchivedNumRequests,
                             forceArchivedNumRequests)
            self.updateReportDict(summary, "total_num_requests", totalNumRequests)
            self.updateReportDict(summary, "clean_num_requests", cleanNumRequests)
            self.updateReportDict(summary, "normal_archived_num_requests", normalArchivedNumRequests)
            self.updateReportDict(summary, "force_archived_num_requests", forceArchivedNumRequests)
        except Exception as ex:
            msg = "Unknown exception while running MSRuleCleaner thread Error: %s"
            self.logger.exception(msg, str(ex))
            self.updateReportDict(summary, "error", msg)

        return summary

    def _execute(self, reqRecords):
        """
        Executes the MSRuleCleaner pipelines based on the workflow status
        :param reqRecords: A dictionary of request records to work on
        :return:           a tuple with:
                               number of processed workflows
                               number of properly cleaned workflows
                               number of normally archived workflows
                               number of force archived workflows
        """
        # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival
        #       Pipelines are executed sequentially in the above order.
        #       This way we assure ourselves that we archive only workflows
        #       that have accomplished the needed cleanup

        cleanNumRequests = 0
        totalNumRequests = 0

        # Call the workflow dispatcher:
        for req in viewvalues(reqRecords):
            wflow = MSRuleCleanerWflow(req)
            self._dispatchWflow(wflow)
            msg = "\n----------------------------------------------------------"
            msg += "\nMSRuleCleanerWflow: %s"
            msg += "\n----------------------------------------------------------"
            self.logger.debug(msg, pformat(wflow))
            totalNumRequests += 1
            if self._checkClean(wflow):
                cleanNumRequests += 1

        # Report the counters:
        for pline in self.cleanuplines:
            msg = "Workflows cleaned by pipeline: %s: %d"
            self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name])
        normalArchivedNumRequests = self.wfCounters['archived']['normalArchived']
        forceArchivedNumRequests = self.wfCounters['archived']['forceArchived']
        self.logger.info("Workflows normally archived: %d", self.wfCounters['archived']['normalArchived'])
        self.logger.info("Workflows force archived: %d", self.wfCounters['archived']['forceArchived'])
        return totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests

    def _dispatchWflow(self, wflow):
        """
        Dispatches a workflow (e.g. based on its status) through one or more
        functional pipelines, for cases where the order of execution involves
        more complicated logic than simply running them sequentially.
        """
        self.logger.debug("Dispatching workflow: %s", wflow['RequestName'])
        # NOTE: The following dispatch logic is a subject to be changed at any time

        # Resolve:
        # NOTE: First resolve any preliminary flags that will be needed further
        #       in the logic of the _dispatcher() itself
        if wflow['RequestStatus'] == 'announced':
            self.getMSOutputTransferInfo(wflow)

        # Clean:
        # Do not clean any Resubmission, but still let them be archived
        if wflow['RequestType'] == 'Resubmission':
            wflow['ForceArchive'] = True
            msg = "Skipping cleanup step for workflow: %s - RequestType is %s."
            msg += " Will try to archive it directly."
            self.logger.info(msg, wflow['RequestName'], wflow['RequestType'])
        elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']:
            # NOTE: We do not check the ParentageResolved flag for these
            #       workflows, but we do need to clean output data placement
            #       rules from the agents for them
            for pline in self.agentlines:
                try:
                    pline.run(wflow)
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']:
            # NOTE: We skip workflows which do not have the 'ParentageResolved'
            #       flag set, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']:
            # NOTE: We skip workflows which have not yet finalised their TransferStatus
            #       in MSOutput, but we still need some proper logging for them.
            msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced' and not wflow['TransferTape']:
            # NOTE: We skip workflows which have not yet finalised their tape transfers.
            #       (i.e. even if a single output which is supposed to be covered
            #       by a tape rule is in any of the following transient states:
            #       {REPLICATING, STUCK, SUSPENDED, WAITING_APPROVAL}.)
            #       We still need some proper logging for them.
            msg = "Skipping workflow: %s - tape transfers are not yet completed."
            msg += " Will retry again in the next cycle."
            self.logger.info(msg, wflow['RequestName'])
        elif wflow['RequestStatus'] == 'announced':
            for pline in self.cleanuplines:
                try:
                    pline.run(wflow)
                except MSRuleCleanerResolveParentError as ex:
                    msg = "%s: Parentage Resolve Error: %s. "
                    msg += "Will retry again in the next cycle."
                    self.logger.error(msg, pline.name, str(ex))
                    continue
                except Exception as ex:
                    msg = "%s: General error from pipeline. Workflow: %s. Error:  \n%s. "
                    msg += "\nWill retry again in the next cycle."
                    self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex))
                    continue
                if wflow['CleanupStatus'][pline.name]:
                    self.wfCounters['cleaned'][pline.name] += 1
        else:
            # We shouldn't be here:
            msg = "Skipping workflow: %s - "
            msg += "Does not fall under any of the defined categories."
            self.logger.error(msg, wflow['RequestName'])

        # Archive:
        try:
            self.plineArchive.run(wflow)
            if wflow['ForceArchive']:
                self.wfCounters['archived']['forceArchived'] += 1
            else:
                self.wfCounters['archived']['normalArchived'] += 1
        except MSRuleCleanerArchivalSkip as ex:
            msg = "%s: Proper conditions not met: %s. "
            msg += "Skipping archival in the current cycle."
            self.logger.info(msg, wflow['PlineMarkers'][-1], str(ex))
        except MSRuleCleanerArchivalError as ex:
            msg = "%s: Archival Error: %s. "
            msg += "Will retry again in the next cycle."
            self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex))
        except Exception as ex:
            msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. "
            msg += "\nWill retry again in the next cycle."
            self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex))

    def setPlineMarker(self, wflow, pName):
        """
        A function intended to mark which pipeline is currently working
        on the workflow. It is supposed to always be called as the first function
        in the pipeline.
        :param  wflow:   A MSRuleCleaner workflow representation
        :param  pName:   The name of the functional pipeline
        :return:         The workflow object
        """
        # NOTE: The current functional pipeline MUST always be appended at the
        #       end of the 'PlineMarkers' list

        # First get rid of the default:
        if not wflow['PlineMarkers']:
            wflow['PlineMarkers'] = []

        # Then push our current value into the markers list:
        wflow['PlineMarkers'].append(pName)

        # Populate the list of flags to be used later:
        if pName not in wflow['RulesToClean']:
            if pName in self.cleanupPipeNames:
                wflow['RulesToClean'][pName] = []
        if pName not in wflow['CleanupStatus']:
            if pName in self.cleanupPipeNames:
                wflow['CleanupStatus'][pName] = False
        return wflow

    def _checkClean(self, wflow):
        """
        An auxiliary function used to only check the temporary cleanup status.
        It basically takes the pipelines registered in 'PlineMarkers' that have
        already worked on the workflow as a mask and applies this mask over
        the set of flags in the 'CleanupStatus' field and then reduces the
        result to a single bool value
        """
        # NOTE: This is one of the few functions taking a workflow as an argument
        #       but returning a bool, since it is an auxiliary function and is not
        #       supposed to be called as a standalone function in a pipeline.
        # NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows
        #       are also counted as properly cleaned and can trigger archival later

        # Build a list of bool flags based on the mask of PlineMarkers
        cleanFlagsList = [wflow['CleanupStatus'][key]
                          for key in wflow['PlineMarkers']
                          if key in wflow['CleanupStatus']]
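        # Illustration with made-up values: PlineMarkers = ['plineInput', 'plineBlock']
        # and CleanupStatus = {'plineInput': True, 'plineBlock': False} would give
        # cleanFlagsList == [True, False], i.e. the workflow is not yet considered clean.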

        # If no pipeline has worked on the workflow, set the clean status to False
        if not wflow['PlineMarkers']:
            cleanStatus = False
        # If the mask did not match any flags (empty cleanFlagsList), avoid false
        # positives caused by the behavior explained above - `all([]) == True`
        elif not cleanFlagsList:
            cleanStatus = False
        # Figure out the final value
        else:
            cleanStatus = all(cleanFlagsList)
        return cleanStatus

    def setClean(self, wflow):
        """
        A function to set the 'IsClean' flag based on the status from all the
        pipelines which have worked on the workflow (and have put their markers
        in the 'PlineMarkers' list)
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        wflow['IsClean'] = self._checkClean(wflow)
        return wflow

    def _checkLogDBClean(self, wflow):
        """
        An auxiliary function used to only check the LogDB cleanup status.
        It makes a query to LogDB in order to verify there are no records for
        the current workflow
        :param wflow:       A MSRuleCleaner workflow representation
        :return:            True if no records were found in LogDB about wflow
        """
        cleanStatus = False
        logDBRecords = self.logDB.get(wflow['RequestName'])
        self.logger.debug("logDBRecords: %s", pformat(logDBRecords))
        if not logDBRecords:
            cleanStatus = True
        return cleanStatus

    def setLogDBClean(self, wflow):
        """
        A function to set the 'IsLogDBClean' flag based on the presence of any
        records in LogDB for the current workflow.
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        wflow['IsLogDBClean'] = self._checkLogDBClean(wflow)
        if not wflow['IsLogDBClean'] and wflow['IsArchivalDelayExpired']:
            wflow['IsLogDBClean'] = self._cleanLogDB(wflow)
        return wflow

    def _cleanLogDB(self, wflow):
        """
        A function to be used for cleaning all the records related to a workflow in logDB.
        :param wflow:       A MSRuleCleaner workflow representation
        :return:            True if NO errors were encountered while deleting
                            records from LogDB
        """
        cleanStatus = False
        try:
            if self.msConfig['enableRealMode']:
                self.logger.info("Deleting %s records from LogDB WMStats...", wflow['RequestName'])
                res = self.logDB.delete(wflow['RequestName'], agent=False)
                if res == 'delete-error':
                    msg = "Failed to delete logDB docs for wflow: %s" % wflow['RequestName']
                    raise MSRuleCleanerArchivalError(msg)
                cleanStatus = True
            else:
                self.logger.info("DRY-RUN: NOT Deleting %s records from LogDB WMStats...", wflow['RequestName'])
        except Exception as ex:
            msg = "General Exception while cleaning LogDB records for wflow: %s : %s"
            self.logger.exception(msg, wflow['RequestName'], str(ex))
        return cleanStatus

    def findTargetStatus(self, wflow):
        """
        Find the proper targeted archival status
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        # Check the available status transitions before we decide the final status
        targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(wflow['RequestStatus'], [])
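        # For example (illustrative; the actual map lives in RequestStatus): a workflow in
        # 'announced' is expected to offer an archival transition such as 'normal-archived',
        # which self.targetStatusRegex (defined outside this snippet) is meant to match.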
        for status in targetStatusList:
            if self.targetStatusRegex.match(status):
                wflow['TargetStatus'] = status
        self.logger.debug("TargetStatus: %s", wflow['TargetStatus'])
        return wflow

    def _checkArchDelayExpired(self, wflow):
        """
        A function to check Archival Expiration Delay based on the information
        returned by WMStatsServer regarding the time of the last request status transition
        :param wflow:      MSRuleCleaner workflow representation
        :return:           True if the archival delay has expired
        """
        archDelayExpired = False
        currentTime = int(time.time())
        threshold = self.msConfig['archiveDelayHours'] * 3600
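        # e.g. with a (hypothetical) archiveDelayHours of 8, the threshold is 28800 seconds,
        # i.e. the last status transition must be more than 8 hours old.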
        try:
            lastTransitionTime = wflow['RequestTransition'][-1]['UpdateTime']
            if lastTransitionTime and (currentTime - lastTransitionTime) > threshold:
                archDelayExpired = True
        except KeyError:
            self.logger.debug("Could not find status transition history for %s", wflow['RequestName'])
        return archDelayExpired

    def setArchivalDelayExpired(self, wflow):
        """
        A function to set the 'IsArchivalDelayExpired' flag
        """
        wflow['IsArchivalDelayExpired'] = self._checkArchDelayExpired(wflow)
        return wflow

    def archive(self, wflow):
        """
        Move the workflow to the proper archived status after checking
        the full cleanup status
        :param  wflow:      A MSRuleCleaner workflow representation
        :return:            The workflow object
        """
        # Make all the needed checks before trying to archive
        if not (wflow['IsClean'] or wflow['ForceArchive']):
            msg = "Not properly cleaned workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not wflow['TargetStatus']:
            msg = "Could not determine which archival status to target for workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalError(msg)
        if not wflow['IsLogDBClean']:
            msg = "LogDB records have not been cleaned for workflow: %s" % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not wflow['IsArchivalDelayExpired']:
            msg = "Archival delay period has not yet expired for workflow: %s." % wflow['RequestName']
            raise MSRuleCleanerArchivalSkip(msg)
        if not self.msConfig['enableRealMode']:
            msg = "Real Run Mode not enabled."
            raise MSRuleCleanerArchivalSkip(msg)

        # Proceed with the actual archival:
        try:
            self.reqmgr2.updateRequestStatus(wflow['RequestName'], wflow['TargetStatus'])
            msg = "Successful status transition to: %s for workflow: %s"
            self.logger.info(msg, wflow['TargetStatus'], wflow['RequestName'])
        except Exception as ex:
            msg = "General Exception while trying status transition to: %s " % wflow['TargetStatus']
            msg += "for workflow: %s : %s" % (wflow['RequestName'], str(ex))
            raise MSRuleCleanerArchivalError(msg)
        return wflow

    def getMSOutputTransferInfo(self, wflow):
        """
        Fetches the transfer information from the MSOutput REST interface for
        the given workflow.
        :param  wflow:   A MSRuleCleaner workflow representation
        :return:         The workflow object
        """
        headers = {'Accept': 'application/json'}
        params = {}
        # Make sure transferInfo is defined even if the MSOutput call below fails:
        transferInfo = None
        url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'],
                                           wflow['RequestName'])
        try:
            res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
            data = json.loads(res)['result'][0]
            transferInfo = data['transferDoc']
        except Exception as ex:
            msg = "General exception while fetching TransferInfo from MSOutput for %s. "
            msg += "Error: %s"
            self.logger.exception(msg, wflow['RequestName'], str(ex))

        # Set Transfer status - information fetched from MSOutput only
        if transferInfo is not None and transferInfo['TransferStatus'] == 'done':
            wflow['TransferDone'] = True

        # Set Tape rules status - information fetched from Rucio (tape rule ids from MSOutput)
        if transferInfo is not None and transferInfo['OutputMap']:
            tapeRulesStatusList = []
            # For setting 'TransferTape' = True we require that either no tape rules have
            # been created for the workflow or that all existing tape rules are in status
            # 'OK', so every empty TapeRuleID is considered completed.
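            # NOTE: all([]) evaluates to True, so a workflow whose OutputMap contains no
            #       TapeRuleID at all also ends up with 'TransferTape' set to True.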
            for mapRecord in transferInfo['OutputMap']:
                if not mapRecord['TapeRuleID']:
                    continue
                rucioRule = self.rucio.getRule(mapRecord['TapeRuleID'])
                if not rucioRule:
                    tapeRulesStatusList.append(False)
                    msg = "Tape rule: %s not found for workflow: %s. "
                    msg += "Possible server-side error."
                    self.logger.error(msg, mapRecord['TapeRuleID'], wflow['RequestName'])
                    continue
                if rucioRule['state'] == 'OK':
                    tapeRulesStatusList.append(True)
                    msg = "Tape rule: %s in final state: %s for workflow: %s"
                    self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName'])
                else:
                    tapeRulesStatusList.append(False)
                    msg = "Tape rule: %s in non-final state: %s for workflow: %s"
                    self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName'])
            if all(tapeRulesStatusList):
                wflow['TransferTape'] = True

        return wflow

    def setParentDatasets(self, wflow):
        """
        Used to resolve parent datasets for a workflow.
        :param  wflow:   A MSRuleCleaner workflow representation
        :return:         The workflow object
        """
        if wflow['InputDataset'] and wflow['IncludeParents']:
            childDataset = wflow['InputDataset']
            parentDataset = findParent([childDataset], self.msConfig['dbsUrl'])
            # NOTE: If findParent() returned None then the DBS service failed to
            #       resolve the request (it is considered an ERROR outside WMCore)
            if parentDataset.get(childDataset, None) is None:
                msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName'])
                raise MSRuleCleanerResolveParentError(msg)
            elif parentDataset:
                wflow['ParentDataset'] = [parentDataset[childDataset]]
                msg = "Found parent %s for input dataset %s in workflow: %s "
                self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName'])
            else:
                msg = "Could not find parent for input dataset: %s in workflow: %s"
                self.logger.error(msg, wflow['InputDataset'], wflow['RequestName'])
        return wflow

    def getRucioRules(self, wflow, gran, rucioAcct):
        """
        Queries Rucio and builds the relevant list of block-level rules for
        the given workflow
        :param  wflow:     A MSRuleCleaner workflow representation
        :param  gran:      Data granularity to search for Rucio rules. Possible values:
                           'block' or 'container'
        :param  rucioAcct: The Rucio account whose data placement rules are to be collected
        :return:           The workflow object
        """
        currPline = wflow['PlineMarkers'][-1]

        # Map each Rucio account to the workflow fields holding the containers whose
        # rules it owns, and set the checkGlobalLocks flag.
        mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"],
                       self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup",
                                                           "DataPileup", "ParentDataset"]}
        if rucioAcct == self.msConfig['rucioMStrAccount']:
            checkGlobalLocks = True
        else:
            checkGlobalLocks = False
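        # In other words: rules created under msConfig['rucioWmaAccount'] are looked up
        # for the output containers, while rules created under msConfig['rucioMStrAccount']
        # are looked up for the input, pileup and parent containers; only the latter are
        # cross-checked against the global locks.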

        # Find all the data placement rules created by the components:
        for dataType in mapRuleType[rucioAcct]:
            dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]]
            for dataCont in dataList:
                if dataCont is None:
                    continue
                self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont))
                if checkGlobalLocks and dataCont in self.globalLocks:
                    msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the "
                    msg += "RulesToClean list for both container and block level Rules for workflow: %s!"
                    self.logger.info(msg, dataCont, wflow['RequestName'])
                    continue
                if gran == 'container':
                    for rule in self.rucio.listDataRules(dataCont, account=rucioAcct):
                        wflow['RulesToClean'][currPline].append(rule['id'])
                        msg = "Found %s container-level rule to be deleted for container %s"
                        self.logger.info(msg, rule['id'], dataCont)
                elif gran == 'block':
                    try:
                        blocks = self.rucio.getBlocksInContainer(dataCont)
                        for block in blocks:
                            for rule in self.rucio.listDataRules(block, account=rucioAcct):
                                wflow['RulesToClean'][currPline].append(rule['id'])
                                msg = "Found %s block-level rule to be deleted for container %s"
                                self.logger.info(msg, rule['id'], dataCont)
                    except WMRucioDIDNotFoundException:
                        msg = "Container: %s not found in Rucio for workflow: %s."
                        self.logger.info(msg, dataCont, wflow['RequestName'])
        return wflow

    def cleanRucioRules(self, wflow):
        """
        Cleans all the rules present in the field 'RulesToClean' of the MSRuleCleaner
        workflow representation and fills in the relevant cleanup status.
        :param wflow:   A MSRuleCleaner workflow representation
        :return:        The workflow object
        """
        # NOTE: The function should be called independently and sequentially from
        #       the Input and the respective Block-level pipelines.

        # NOTE: The current functional pipeline is always the last one in the PlineMarkers list
        currPline = wflow['PlineMarkers'][-1]
        delResults = []
        if self.msConfig['enableRealMode']:
            for rule in wflow['RulesToClean'][currPline]:
                self.logger.info("%s: Deleting ruleId: %s ", currPline, rule)
                delResult = self.rucio.deleteRule(rule)
                delResults.append(delResult)
                if not delResult:
                    self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule)
        else:
            for rule in wflow['RulesToClean'][currPline]:
                delResults.append(True)
                self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule)

        # Set the cleanup flag:
        wflow['CleanupStatus'][currPline] = all(delResults)
        return wflow

    def getRequestRecords(self, reqStatus):
        """
        Queries ReqMgr2 for requests in a given status.
        :param reqStatus: The status for the requests to be fetched from ReqMgr2
        :return requests: A dictionary with all the workflows in the given status
        """
        self.logger.info("Fetching requests in status: %s", reqStatus)
        result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True)
        if not result:
            requests = {}
        else:
            requests = result[0]
        self.logger.info('  retrieved %s requests in status: %s', len(requests), reqStatus)
        return requests
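
# --------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the original WMCore code): it
# illustrates, with made-up pipeline names and a plain dict in place of a
# MSRuleCleanerWflow object, how the 'PlineMarkers' / 'CleanupStatus' /
# 'ForceArchive' bookkeeping above drives the final archival decision.
# --------------------------------------------------------------------------

def sketchCheckClean(wflow):
    """Mask 'CleanupStatus' with 'PlineMarkers' and reduce it to a single bool."""
    flags = [wflow['CleanupStatus'][name]
             for name in wflow['PlineMarkers']
             if name in wflow['CleanupStatus']]
    # all([]) is True, so guard against an empty mask or an empty flag list:
    if not wflow['PlineMarkers'] or not flags:
        return False
    return all(flags)


def sketchCanArchive(wflow):
    """A workflow becomes eligible for archival once it is clean or force-archived."""
    return sketchCheckClean(wflow) or wflow.get('ForceArchive', False)


exampleWflow = {'RequestName': 'fake_workflow',                  # hypothetical name
                'PlineMarkers': ['plineInput', 'plineBlock'],    # hypothetical pipelines
                'CleanupStatus': {'plineInput': True, 'plineBlock': False},
                'ForceArchive': False}
assert sketchCanArchive(exampleWflow) is False    # one pipeline has not finished cleaning
exampleWflow['CleanupStatus']['plineBlock'] = True
assert sketchCanArchive(exampleWflow) is True     # now the workflow may be archived
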
Example #12
0
class DBSUploadPoller(BaseWorkerThread):
    """
    Handles poll-based DBSUpload

    """

    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        # Tier0 Agent doesn't need this
        if hasattr(self.config, "Tier0Feeder"):
            self.wmstatsServerSvc = None
        else:
            wmstatsSvcURL = self.config.General.centralWMStatsURL.replace("couchdb/wmstats",
                                                                          "wmstatsserver")
            self.wmstatsServerSvc = WMStatsServer(wmstatsSvcURL)

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.updateBlocksDAO = daoFactory(classname="UpdateBlocks")
        self.updateFilesDAO = daoFactory(classname="UpdateFiles")
        self.createBlocksDAO = daoFactory(classname="CreateBlocks")
        self.setBlockFilesDAO = daoFactory(classname="SetBlockFiles")

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup", "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType", "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload, "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'dumpBlock', False)

        self.copyPath = os.path.join(getattr(self.config.DBS3Upload, 'componentDir', '/data/srv/'),
                                     'dbsuploader_block.json')

        self.timeoutWaiver = 1

        self.datasetParentageCache = {}

        return

    def setupPool(self):
        """
        _setupPool_

        Set up the processing pool for work
        """
        if len(self.pool) > 0:
            # Then something already exists.  Continue
            return

        self.workInput = multiprocessing.Queue()
        self.workResult = multiprocessing.Queue()

        # Starting up the pool:
        for _ in range(self.nProc):
            p = multiprocessing.Process(target=uploadWorker,
                                        args=(self.workInput,
                                              self.workResult,
                                              self.dbsUrl))
            p.start()
            self.pool.append(p)

        return

    def __del__(self):
        """
        __del__

        Trigger a close of connections if necessary
        """
        self.close()
        return

    def close(self):
        """
        _close_

        Kill all connections and terminate
        """
        terminate = False
        for _ in self.pool:
            try:
                self.workInput.put('STOP')
            except Exception as ex:
                # Something very strange happens here
                # It's like it raises a blank exception
                # Upon being told to return
                msg = "Hit some exception in deletion\n"
                msg += str(ex)
                logging.debug(msg)
                terminate = True
        try:
            self.workInput.close()
            self.workResult.close()
        except Exception:
            # What are you going to do?
            pass
        for proc in self.pool:
            if terminate:
                proc.terminate()
            else:
                proc.join()
        self.pool = []
        self.workInput = None
        self.workResult = None
        return

    def terminate(self, params):
        """
        Do one more pass, then terminate

        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        First, check blocks that may be already uploaded
        Then, load blocks
        Then, load files
        Then, move files into blocks
        Then add new blocks in DBSBuffer
        Then add blocks to DBS
        Then mark blocks as done in DBSBuffer
        """
        logging.info("Starting the DBSUpload Polling Cycle")
        # refreshing parentageCache every cycle
        if self.updateDatasetParentageCache() is False:
            return

        logging.debug("Dataset parentage map: %s", self.datasetParentageCache)
        try:
            self.checkBlocks()
            self.loadBlocks()
            self.loadFiles()
            self.checkBlockCompletion()
            self.inputBlocks()
            self.retrieveBlocks()
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled Exception in DBSUploadPoller! Error: %s" % str(ex)
            logging.exception(msg)
            raise DBSUploadException(msg)

    def updateDatasetParentageCache(self):
        """
        Return True to indicate it successfully fetched the parentage
        map. If there was an exception, return False
        """
        success = True
        if not self.wmstatsServerSvc:
            self.datasetParentageCache = {}
            return success

        try:
            self.datasetParentageCache = self.wmstatsServerSvc.getChildParentDatasetMap()
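            # The cache maps child dataset -> parent dataset, e.g. (illustrative only):
            # {'/PrimDS/Campaign-Proc-v1/AODSIM': '/PrimDS/Campaign-Proc-v1/GEN-SIM'}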
        except Exception as ex:
            reason = getattr(ex, 'reason', '')
            msg = 'Failed to fetch parentage map from WMStats, skipping this cycle. '
            if 'Service Unavailable' in reason or 'Proxy Error' in reason or \
                    'Error reading from remote server' in reason:
                msg += 'Error: %s' % reason
            elif 'Connection refused' in str(ex):
                msg += 'Error: %s' % str(ex)
            else:
                msg = "Unknown failure while fetching parentage map from WMStats. Error: %s" % str(ex)
                raise DBSUploadException(msg)

            logging.warning(msg)
            success = False

        return success

    def loadBlocks(self):
        """
        _loadBlocks_

        Find all blocks; make sure they're in the cache
        """
        openBlocks = self.dbsUtil.findOpenBlocks()
        logging.info("Found %d open blocks.", len(openBlocks))
        logging.debug("These are the openblocks: %s", openBlocks)

        # Load them if we don't have them
        blocksToLoad = []
        for block in openBlocks:
            if block['blockname'] not in self.blockCache.keys():
                blocksToLoad.append(block['blockname'])

        # Now load the blocks
        try:
            loadedBlocks = self.dbsUtil.loadBlocks(blocksToLoad)
            logging.info("Loaded %d blocks.", len(loadedBlocks))
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while loading blocks.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks to load: %s\n", blocksToLoad)
            raise DBSUploadException(msg)

        for blockInfo in loadedBlocks:
            block = DBSBufferBlock(name=blockInfo['block_name'],
                                   location=blockInfo['origin_site_name'],
                                   datasetpath=blockInfo['datasetpath'])

            parent = self.datasetParentageCache.get(blockInfo['datasetpath'])
            if parent:
                block.addDatasetParent(parent)
                logging.debug("Load block: Child dataset %s, Parent dataset %s", blockInfo['datasetpath'], parent)
            block.FillFromDBSBuffer(blockInfo)
            blockname = block.getName()

            # Now we have to load files...
            try:
                files = self.dbsUtil.loadFilesByBlock(blockname=blockname)
                logging.info("Have %i files for block %s", len(files), blockname)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading files for existing blocks.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks being loaded: %s\n", blockname)
                raise DBSUploadException(msg)

            # Add the loaded files to the block
            for f in files:
                block.addFile(f, self.datasetType, self.primaryDatasetType)

            # Add to the cache
            self.blockCache[blockInfo['block_name']] = block

        return

    def loadFiles(self):
        """
        _loadFiles_

        Load all files that need to be loaded.  I will do this by DatasetPath
        to break the monstrous calls down into smaller chunks.
        """
        dspList = self.dbsUtil.findUploadableDAS()

        readyBlocks = []
        for dspInfo in dspList:

            datasetpath = dspInfo['DatasetPath']

            # Get the files
            try:
                loadedFiles = self.dbsUtil.findUploadableFilesByDAS(datasetpath=datasetpath)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading uploadable files for DatasetPath.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("DatasetPath being loaded: %s\n", datasetpath)
                raise DBSUploadException(msg)

            # Sort the files and blocks by location
            fileDict = sortListByKey(loadedFiles, 'locations')
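            # fileDict groups the loadable files by location, e.g. (illustrative only):
            # {'T1_US_FNAL_Disk': [file1, file2], 'T2_CH_CERN': [file3]}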

            # Now add each file
            for location in fileDict.keys():

                files = fileDict.get(location)

                if len(files) > 0:

                    currentBlock = self.getBlock(files[0], location, True)
                    currentBlock.setAcquisitionEra(era=dspInfo['AcquisitionEra'])
                    currentBlock.setProcessingVer(procVer=dspInfo['ProcessingVer'])

                    for newFile in files:

                        if newFile.get('block', 1) is not None:
                            # Then this file already has a block
                            # It should be accounted for somewhere
                            # Or loaded with the block
                            continue

                        # Check if we can put files in this block
                        if not self.isBlockOpen(newFile=newFile,
                                                block=currentBlock):
                            # Then we have to close the block and get a new one
                            currentBlock.setPendingAndCloseBlock()
                            readyBlocks.append(currentBlock)
                            currentBlock = self.getBlock(newFile=newFile,
                                                         location=location)
                            currentBlock.setAcquisitionEra(era=dspInfo['AcquisitionEra'])
                            currentBlock.setProcessingVer(procVer=dspInfo['ProcessingVer'])

                        # Now deal with the file
                        currentBlock.addFile(newFile, self.datasetType, self.primaryDatasetType)
                        self.filesToUpdate.append({'filelfn': newFile['lfn'],
                                                   'block': currentBlock.getName()})
                    # Done with the location
                    readyBlocks.append(currentBlock)

        for block in readyBlocks:
            self.blockCache[block.getName()] = block

        return

    def checkBlockCompletion(self):
        """
        _checkBlockCompletion_

        Mark Open blocks as Pending if they have timed out or their workflows have completed
        """
        completedWorkflows = self.dbsUtil.getCompletedWorkflows()
        for block in self.blockCache.values():
            if block.status == "Open":
                if (block.getTime() > block.getMaxBlockTime()) or any(
                        key in completedWorkflows for key in block.workflows):
                    block.setPendingAndCloseBlock()

        return

    def isBlockOpen(self, newFile, block, doTime=False):
        """
        _isBlockOpen_

        Check and see if a block is full.
        This will check on time, but that's disabled by default.
        The plan is to do a time check after we do everything else,
        so open blocks about to time out can still get more
        files put in them.
        """

        if block.getMaxBlockFiles() is None or block.getMaxBlockNumEvents() is None or \
                block.getMaxBlockSize() is None or block.getMaxBlockTime() is None:
            return True
        if block.status != 'Open':
            # Then somebody has dumped this already
            return False
        if block.getSize() + newFile['size'] > block.getMaxBlockSize():
            return False
        if block.getNumEvents() + newFile['events'] > block.getMaxBlockNumEvents():
            return False
        if block.getNFiles() >= block.getMaxBlockFiles():
            # Then we have to dump it because this file
            # will put it over the limit.
            return False
        if block.getTime() > block.getMaxBlockTime() and doTime:
            return False

        return True

    def getBlock(self, newFile, location, skipOpenCheck=False):
        """
        _getBlock_

        Retrieve a block if one exists with a matching datasetpath/location and is open.
        If no such block is found, create and return a new one.
        """
        datasetpath = newFile["datasetPath"]

        for block in self.blockCache.values():
            if datasetpath == block.getDatasetPath() and location == block.getLocation():
                if not self.isBlockOpen(newFile=newFile, block=block) and not skipOpenCheck:
                    # Block isn't open anymore.  Mark it as pending so that it gets uploaded.
                    block.setPendingAndCloseBlock()
                else:
                    return block

        # A suitable open block does not exist.  Create a new one.
        blockname = "%s#%s" % (datasetpath, makeUUID())
        newBlock = DBSBufferBlock(name=blockname,
                                  location=location,
                                  datasetpath=datasetpath)

        parent = self.datasetParentageCache.get(datasetpath)
        if parent:
            newBlock.addDatasetParent(parent)
            logging.debug("Get block: Child dataset %s, Parent dataset %s", datasetpath, parent)

        self.blockCache[blockname] = newBlock
        return newBlock

    def inputBlocks(self):
        """
        _inputBlocks_

        Loop through all of the "active" blocks and sort them so we can act
        appropriately on them.  Everything will be sorted based on the
        following:
         Queued - Block is already being acted on by another process.  We just
          ignore it.
         Pending, not in DBSBuffer - Block that has been closed and needs to
           be injected into DBS and also written to DBSBuffer.  We'll do both.
         Pending, in DBSBuffer - Block has been closed and written to
           DBSBuffer.  We just need to inject it into DBS.
         Open, not in DBSBuffer - Newly created block that needs to be written
           to DBSBuffer.
         Open, in DBSBuffer - Newly created block that has already been
           written to DBSBuffer.  We don't have to do anything with it.
        """
        if not self.blockCache:
            return

        myThread = threading.currentThread()

        createInDBS = []
        createInDBSBuffer = []
        updateInDBSBuffer = []
        for block in self.blockCache.values():
            if block.getName() in self.queuedBlocks:
                # Block is already being dealt with by another process.  We'll
                # ignore it here.
                continue
            if block.status == 'Pending':
                # All pending blocks need to be injected into DBS.
                createInDBS.append(block)

                # If this is a new block it needs to be added to DBSBuffer
                # otherwise it just needs to be updated in DBSBuffer.
                if not block.inBuff:
                    createInDBSBuffer.append(block)
                else:
                    updateInDBSBuffer.append(block)
            if block.status == 'Open' and not block.inBuff:
                # New block that needs to be added to DBSBuffer.
                createInDBSBuffer.append(block)

        # First handle new and updated blocks
        if len(createInDBSBuffer) > 0 or len(updateInDBSBuffer) > 0:
            try:
                myThread.transaction.begin()
                if len(createInDBSBuffer) > 0:
                    self.createBlocksDAO.execute(blocks=createInDBSBuffer,
                                                 conn=myThread.transaction.conn,
                                                 transaction=True)
                if len(updateInDBSBuffer) > 0:
                    self.updateBlocksDAO.execute(blocks=updateInDBSBuffer,
                                                 conn=myThread.transaction.conn,
                                                 transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while writing new blocks into DBSBuffer\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks for DBSBuffer: %s\n", createInDBSBuffer)
                logging.debug("Blocks for Update: %s\n", updateInDBSBuffer)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        # Update block status in the block cache.  Mark the blocks that we have
        # added to DBSBuffer as being in DBSBuffer.
        for block in createInDBSBuffer:
            self.blockCache.get(block.getName()).inBuff = True

        # Record new file/block associations in DBSBuffer.
        if len(self.filesToUpdate) > 0:
            try:
                myThread.transaction.begin()
                self.setBlockFilesDAO.execute(binds=self.filesToUpdate,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
                self.filesToUpdate = []
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while setting blocks in files.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Files to Update: %s\n", self.filesToUpdate)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        if not createInDBS:
            # then there is nothing else to do
            return

        # Build the pool if it was closed
        if len(self.pool) == 0:
            self.setupPool()

        # Finally upload blocks to DBS.
        for block in createInDBS:
            if len(block.files) < 1:
                # What are we doing?
                logging.debug("Skipping empty block")
                continue
            if block.getDataset() is None:
                # Then we have to fix the dataset
                dbsFile = block.files[0]
                block.setDataset(datasetName=dbsFile['datasetPath'],
                                 primaryType=self.primaryDatasetType,
                                 datasetType=self.datasetType,
                                 physicsGroup=dbsFile.get('physicsGroup', None),
                                 prep_id=dbsFile.get('prep_id', None))
            logging.debug("Found block %s in blocks", block.getName())
            block.setPhysicsGroup(group=self.physicsGroup)

            encodedBlock = block.convertToDBSBlock()
            logging.info("About to insert block %s", block.getName())
            self.workInput.put({'name': block.getName(), 'block': encodedBlock})
            self.blockCount += 1
            if self.produceCopy:
                with open(self.copyPath, 'w') as jo:
                    json.dump(encodedBlock, jo, indent=2)
            self.queuedBlocks.append(block.getName())

        # And all work is in and we're done for now
        return

    def retrieveBlocks(self):
        """
        _retrieveBlocks_

        Once blocks are in DBS, we have to retrieve them and see what's
        in them.  What we do is get everything out of the result queue,
        and then update it in DBSBuffer.

        To do this, the result queue needs to pass back the blockname
        """
        myThread = threading.currentThread()

        blocksToClose = []
        emptyCount = 0
        while self.blockCount > 0:
            if emptyCount > self.nTries:

                # When timeoutWaiver is 0, raise an error.
                # Uploading data to DBS can take a long time if a lot of files have
                # accumulated in the buffer; this mostly affects the first try, while
                # the second try should be faster.
                # timeoutWaiver is a component-level variable and only resets when the
                # component is restarted. The reason is that a backlog only builds up
                # when this component has been down for a long time while the other
                # components keep running and feeding data into dbsbuffer.

                if self.timeoutWaiver == 0:
                    msg = "Exceeded max number of waits while waiting for DBS to finish"
                    raise DBSUploadException(msg)
                else:
                    self.timeoutWaiver = 0
                    return
            try:
                # Get stuff out of the queue with a ridiculously
                # short wait time
                blockresult = self.workResult.get(timeout=self.wait)
                blocksToClose.append(blockresult)
                self.blockCount -= 1
                logging.debug("Got a block to close")
            except Queue.Empty:
                # This means the queue has no current results
                time.sleep(2)
                emptyCount += 1
                continue

        loadedBlocks = []
        for result in blocksToClose:
            # Remove from list of work being processed
            self.queuedBlocks.remove(result.get('name'))
            if result["success"] == "uploaded":
                block = self.blockCache.get(result.get('name'))
                block.status = 'InDBS'
                loadedBlocks.append(block)
            elif result["success"] == "check":
                block = result["name"]
                self.blocksToCheck.append(block)
            else:
                logging.error("Error found in multiprocess during process of block %s", result.get('name'))
                logging.error(result['error'])
                # Continue to the next block
                # Block will remain in pending status until it is transferred

        if len(loadedBlocks) > 0:
            try:
                myThread.transaction.begin()
                self.updateFilesDAO.execute(blocks=loadedBlocks, status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
                self.updateBlocksDAO.execute(blocks=loadedBlocks,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
            except Exception as ex:
                myThread.transaction.rollback()
                # possible deadlock with PhEDExInjector, retry once after 10s
                logging.warning("Oracle exception, possible deadlock due to race condition, retry after 10s sleep")
                time.sleep(10)
                try:
                    myThread.transaction.begin()
                    self.updateFilesDAO.execute(blocks=loadedBlocks, status="InDBS",
                                                conn=myThread.transaction.conn,
                                                transaction=True)
                    self.updateBlocksDAO.execute(blocks=loadedBlocks,
                                                 conn=myThread.transaction.conn,
                                                 transaction=True)
                except Exception as ex:
                    myThread.transaction.rollback()
                    msg = "Unhandled exception while finishing closed blocks in DBSBuffer\n"
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Blocks for Update: %s\n", loadedBlocks)
                    raise DBSUploadException(msg)
                else:
                    myThread.transaction.commit()

            else:
                myThread.transaction.commit()

        for block in loadedBlocks:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean up the pool so we don't have stuff waiting around
        if len(self.pool) > 0:
            self.close()

        # And we're done
        return

    def checkBlocks(self):
        """
        _checkBlocks_

        Check with DBS3 if the blocks marked as check are
        uploaded or not.
        """
        myThread = threading.currentThread()

        blocksUploaded = []

        # See if there is anything to check
        for block in self.blocksToCheck:
            logging.debug("Checking block existence: %s", block)
            # Check in DBS if the block was really inserted
            try:
                result = self.dbsApi.listBlocks(block_name=block)
                # it is an empty list if block cannot be found
                if result:
                    loadedBlock = self.blockCache.get(block)
                    loadedBlock.status = 'InDBS'
                    blocksUploaded.append(loadedBlock)
            except Exception as ex:
                msg = "Error trying to check block %s through DBS. Error: %s" % (block, str(ex))
                logging.exception(msg)

        # Update the status of those blocks that were truly inserted
        if len(blocksUploaded) > 0:
            try:
                myThread.transaction.begin()
                self.updateBlocksDAO.execute(blocks=blocksUploaded,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
                self.updateFilesDAO.execute(blocks=blocksUploaded, status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while finishing closed blocks in DBSBuffer\n"
                msg += str(ex)
                logging.exception(msg)
                logging.debug("Blocks for Update: %s\n", blocksUploaded)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        for block in blocksUploaded:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean the check list
        self.blocksToCheck = []

        # We're done
        return
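
# --------------------------------------------------------------------------
# A stripped-down, self-contained sketch (not part of the original WMCore
# code) of the worker-pool pattern used by setupPool(), inputBlocks() and
# retrieveBlocks() above: work items go onto an input queue, 'STOP' sentinels
# shut the workers down, and results are drained from an output queue with a
# timeout. All names and payloads here are made up.
# --------------------------------------------------------------------------
from multiprocessing import Process, Queue
from queue import Empty


def sketchUploadWorker(workInput, workResult):
    """Consume block payloads until a 'STOP' sentinel is received."""
    while True:
        work = workInput.get()
        if work == 'STOP':
            break
        # A real worker would call the DBS API here; this sketch just echoes a status.
        workResult.put({'name': work['name'], 'success': 'uploaded'})


if __name__ == '__main__':
    workInput, workResult = Queue(), Queue()
    pool = [Process(target=sketchUploadWorker, args=(workInput, workResult))
            for _ in range(2)]
    for proc in pool:
        proc.start()

    blockNames = ['block_A', 'block_B', 'block_C']   # hypothetical block names
    for name in blockNames:
        workInput.put({'name': name, 'block': {}})
    for _ in pool:
        workInput.put('STOP')

    results = []
    while len(results) < len(blockNames):
        try:
            results.append(workResult.get(timeout=5))
        except Empty:
            break
    for proc in pool:
        proc.join()
    print(results)
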
Example #13
0
class MSUnmerged(MSCore):
    """
    The MSUnmerged class provides the logic for cleaning the unmerged area of
    the CMS LFN Namespace.
    """

    # @profile
    def __init__(self, msConfig, logger=None):
        """
        Runs the basic setup and initialization for the MSUnmerged module
        :param msConfig: micro service configuration
        """
        super(MSUnmerged, self).__init__(msConfig, logger=logger)

        self.msConfig.setdefault("verbose", True)
        self.msConfig.setdefault("interval", 60)
        self.msConfig.setdefault("limitFilesPerRSE", 200)
        self.msConfig.setdefault("skipRSEs", [])
        self.msConfig.setdefault("rseExpr", "*")
        self.msConfig.setdefault("enableRealMode", False)
        self.msConfig.setdefault("dumpRSE", False)
        self.msConfig.setdefault("gfalLogLevel", 'normal')
        self.msConfig.setdefault("dirFilterIncl", [])
        self.msConfig.setdefault("dirFilterExcl", [])
        self.msConfig.setdefault("emulateGfal2", False)
        self.msConfig.setdefault("filesToDeleteSliceSize", 100)
        if self.msConfig['emulateGfal2'] is False and gfal2 is None:
            msg = "Failed to import gfal2 library while it's not "
            msg += "set to emulate it. Crashing the service!"
            raise ImportError(msg)

        # TODO: Add 'alertManagerUrl' to msConfig'
        # self.alertServiceName = "ms-unmerged"
        # self.alertManagerAPI = AlertManagerAPI(self.msConfig.get("alertManagerUrl", None), logger=logger)

        # Instantiating the Rucio Consistency Monitor Client
        self.rucioConMon = RucioConMon(self.msConfig['rucioConMon'], logger=self.logger)

        self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger)

        # Building all the Pipelines:
        pName = 'plineUnmerged'
        self.plineUnmerged = Pipeline(name=pName,
                                      funcLine=[Functor(self.updateRSETimestamps, start=True, end=False),
                                                Functor(self.consRecordAge),
                                                Functor(self.getUnmergedFiles),
                                                Functor(self.filterUnmergedFiles),
                                                Functor(self.getPfn),
                                                Functor(self.cleanRSE),
                                                Functor(self.updateRSECounters, pName),
                                                Functor(self.updateRSETimestamps, start=False, end=True),
                                                Functor(self.purgeRseObj, dumpRSE=self.msConfig['dumpRSE'])])
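        # NOTE: the functors above are applied in the listed order to every RSE handled in
        #       a polling cycle; purgeRseObj() presumably drops the in-memory RSE object at
        #       the end, optionally dumping it first when the 'dumpRSE' flag is set.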
        # Initialization of the deleted files counters:
        self.rseCounters = {}
        self.plineCounters = {}
        self.rseTimestamps = {}
        self.rseConsStats = {}
        self.protectedLFNs = []

        # The basic /store/unmerged regular expression:
        self.regStoreUnmergedLfn = re.compile("^/store/unmerged/.*$")
        self.regStoreUnmergedPfn = re.compile("^.+/store/unmerged/.*$")
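        # Editor's note: the Lfn pattern matches bare logical names such as
        # "/store/unmerged/Run2016B/JetHT/...", while the Pfn pattern also accepts
        # physical names carrying a site-specific prefix before "/store/unmerged/",
        # e.g. a hypothetical "davs://se.example.org:2880/pnfs/cms/store/unmerged/...".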

    # @profile
    def execute(self):
        """
        Executes the whole MSUnmerged logic
        :return: summary
        """
        # start threads in MSManager which should call this method
        summary = dict(UNMERGED_REPORT)

        # fetch the protectedLFNs list
        try:
            self.protectedLFNs = set(self.wmstatsSvc.getProtectedLFNs())
            # self.logger.debug("protectedLFNs: %s", pformat(self.protectedLFNs))

            if not self.protectedLFNs:
                msg = "Could not fetch the protectedLFNs list from WMStatServer. "
                msg += "Skipping the current run."
                self.logger.error(msg)
                return summary
        except Exception as ex:
            msg = "Unknown exception while trying to fetch the protectedLFNs list from WMStatServer. Error: {}".format(str(ex))
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        # refresh statistics on every polling cycle
        try:
            self.rseConsStats = self.rucioConMon.getRSEStats()
            # self.logger.debug("protectedLFNs: %s", pformat(self.protectedLFNs))

            if not self.rseConsStats:
                msg = "Could not fetch statistics from Rucio Consistency Monitor. "
                msg += "Skipping the current run."
                self.logger.error(msg)
                return summary
        except Exception as ex:
            msg = "Unknown exception while trying to fetch statistics from Rucio Consistency Monitor. Error: {}".format(str(ex))
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        try:
            rseList = self.getRSEList()
            msg = "Retrieved list of %s RSEs: %s "
            msg += "Service set to process up to %s RSEs per instance."
            self.logger.info(msg, len(rseList), pformat(rseList), self.msConfig["limitRSEsPerInstance"])
            random.shuffle(rseList)
        except Exception as err:  # general error
            msg = "Unknown exception while trying to estimate the final list of RSEs to work on. Error: {}".format(str(err))
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        try:
            totalNumRses, totalNumFiles, numRsesCleaned, numFilesDeleted = self._execute(rseList)
            msg = "\nTotal number of RSEs processed: %s."
            msg += "\nTotal number of files fetched from RucioConMon: %s."
            msg += "\nNumber of RSEs cleaned: %s."
            msg += "\nNumber of files deleted: %s."
            self.logger.info(msg,
                             totalNumRses,
                             totalNumFiles,
                             numRsesCleaned,
                             numFilesDeleted)
            self.updateReportDict(summary, "total_num_rses", totalNumRses)
            self.updateReportDict(summary, "total_num_files", totalNumFiles)
            self.updateReportDict(summary, "num_rses_cleaned", numRsesCleaned)
            self.updateReportDict(summary, "num_files_deleted", numFilesDeleted)
        except Exception as ex:
            msg = "Unknown exception while running MSUnmerged thread Error: {}".format(str(ex))
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)

        return summary

    # @profile
    def _execute(self, rseList):
        """
        Executes the MSUnmerged pipelines
        :param rseList: A list of RSEs to work on
        :return:        a tuple with:
                            total number of RSEs
                            total number of files found for deletion
                            number of RSEs cleaned
                            number of files deleted
        """

        pline = self.plineUnmerged
        self.resetCounters(plineName=pline.name)
        self.plineCounters[pline.name]['totalNumRses'] = len(rseList)

        for rseName in rseList:
            try:
                if rseName not in self.msConfig['skipRSEs']:
                    pline.run(MSUnmergedRSE(rseName))
                else:
                    msg = "%s: Run on RSE: %s is skipped due to a restriction set in msConfig. "
                    msg += "Will NOT retry until the RSE is removed from 'skipRSEs' list."
                    self.logger.info(msg, pline.name, rseName)
                    continue
            except MSUnmergedPlineExit as ex:
                msg = "%s: Run on RSE: %s was interrupted due to: %s "
                msg += "Will retry again in the next cycle."
                self.logger.warning(msg, pline.name, rseName, ex.message)
                continue
            except Exception as ex:
                msg = "%s: General error from pipeline. RSE: %s. Error: %s "
                msg += "Will retry again in the next cycle."
                self.logger.exception(msg, pline.name, rseName, str(ex))
                continue
        return self.plineCounters[pline.name]['totalNumRses'], \
            self.plineCounters[pline.name]['totalNumFiles'], \
            self.plineCounters[pline.name]['rsesCleaned'], \
            self.plineCounters[pline.name]['deletedSuccess']

    # @profile
    def cleanRSE(self, rse):
        """
        The method to implement the actual deletion of files for an RSE.
        :param rse: MSUnmergedRSE object to be cleaned
        :return:    The MSUnmergedRSE object
        """

        # Create the gfal2 context object:
        try:
            ctx = createGfal2Context(self.msConfig['gfalLogLevel'], self.msConfig['emulateGfal2'])
        except Exception as ex:
            msg = "RSE: %s, Failed to create gfal2 Context object. " % rse['name']
            msg += "Skipping it in the current run."
            self.logger.exception(msg)
            raise MSUnmergedPlineExit(msg)

        # Start cleaning one directory at a time:
        for dirLfn, fileLfnGen in rse['files']['toDelete'].items():
            if self.msConfig['limitFilesPerRSE'] < 0 or \
               rse['counters']['filesToDelete'] < self.msConfig['limitFilesPerRSE']:

                # First increment the dir counter:
                rse['counters']['dirsToDelete'] += 1

                # Now we consume the rse['files']['toDelete'][dirLfn] generator
                # upon that no values will be left in it. In case we need it again
                # we will have to recreate the filter as we did in self.filterUnmergedFiles()
                pfnList = []
                if not rse['pfnPrefix']:
                    # Fall back to calling Rucio on a per directory basis for
                    # resolving the lfn to pfn mapping
                    dirPfn = self.rucio.getPFN(rse['name'], dirLfn, operation='delete')[dirLfn]
                    for fileLfn in fileLfnGen:
                        fileLfnSuffix = fileLfn.split(dirLfn)[1]
                        filePfn = dirPfn + fileLfnSuffix
                        pfnList.append(filePfn)
                else:
                    # Proceed with assembling the full filePfn out of the rse['pfnPrefix'] and the fileLfn
                    dirPfn = rse['pfnPrefix'] + dirLfn
                    for fileLfn in fileLfnGen:
                        filePfn = rse['pfnPrefix'] + fileLfn
                        pfnList.append(filePfn)

                rse['counters']['filesToDelete'] += len(pfnList)
                msg = "\nRSE: %s \nDELETING: %s."
                msg += "\nPFN list with: %s entries: \n%s"
                self.logger.debug(msg, rse['name'], dirLfn, len(pfnList), twFormat(pfnList, maxLength=4))

                if self.msConfig['enableRealMode']:
                    try:
                        # execute the actual deletion in bulk - full list of files per directory

                        deletedSuccess = 0
                        for pfnSlice in list(grouper(pfnList, self.msConfig["filesToDeleteSliceSize"])):
                            delResult = ctx.unlink(pfnSlice)
                            # Count all the successfully deleted files (if a deletion was
                            # successful a value of None is put in the delResult list):
                            deletedSuccess += sum([1 for pfnStatus in delResult if pfnStatus is None])

                        self.logger.debug("RSE: %s, Dir: %s, deletedSuccess: %s",
                                          rse['name'], dirLfn, deletedSuccess)
                        # accumulate across directories (a plain assignment would keep
                        # only the last directory's successes)
                        rse['counters']['deletedSuccess'] += deletedSuccess

                        # Now clean the whole branch
                        self.logger.debug("Purging dirEntry: %s:\n", dirPfn)
                        purgeSuccess = self._purgeTree(ctx, dirPfn)
                        if not purgeSuccess:
                            msg = "RSE: %s Failed to purge nonEmpty directory: %s"
                            self.logger.error(msg, rse['name'], dirPfn)
                    except Exception as ex:
                        msg = "Error while cleaning RSE: %s. "
                        msg += "Will retry in the next cycle. Err: %s"
                        self.logger.warning(msg, rse['name'], str(ex))
        rse['isClean'] = self._checkClean(rse)

        return rse

    def _purgeTree(self, ctx, baseDirPfn):
        """
        A method to be used for purging the tree below a specific branch.
        It deletes every empty directory below that branch, plus the base directory itself at the end.
        :param ctx:  The gfal2 context object
        :param baseDirPfn: The PFN of the base directory whose tree is to be purged
        :return:     Bool: True if it managed to purge everything, False otherwise
        """
        successList = []

        if baseDirPfn[-1] != '/':
            baseDirPfn += '/'

        for dirEntry in ctx.listdir(baseDirPfn):
            if dirEntry in ['.', '..']:
                continue
            dirEntryPfn = baseDirPfn + dirEntry
            try:
                entryStat = ctx.stat(dirEntryPfn)
            except gfal2.GError:
                e = sys.exc_info()[1]
                if e.code == errno.ENOENT:
                    self.logger.error("MISSING dirEntry: %s", dirEntryPfn)
                    successList.append(False)
                    return all(successList)
                else:
                    self.logger.error("FAILED dirEntry: %s", dirEntryPfn)
                    raise
            if stat.S_ISDIR(entryStat.st_mode):
                successList.append(self._purgeTree(ctx, dirEntryPfn))

        try:
            success = ctx.rmdir(baseDirPfn)
            # for gfal2 rmdir() exit status of 0 is success
            if success == 0:
                successList.append(True)
            else:
                successList.append(False)
            self.logger.debug("RM baseDir: %s", baseDirPfn)
        except gfal2.GError:
            e = sys.exc_info()[1]
            if e.code == errno.ENOENT:
                self.logger.error("MISSING baseDir: %s", baseDirPfn)
            else:
                self.logger.error("FAILED basedir: %s", baseDirPfn)
                raise
        return all(successList)

    def _checkClean(self, rse):
        """
        A simple function to check if every file in an RSE's unmerged area has
        been deleted
        :param rse: The RSE to be checked
        :return:    Bool: True if all files found have been deleted, False otherwise
        """
        return rse['counters']['filesToDelete'] == rse['counters']['deletedSuccess']

    def consRecordAge(self, rse):
        """
        A method to check the age of the consistency record for the RSE
        :param rse: The RSE to be checked
        :return:    rse or raises MSUnmergedPlineExit
        """
        rseName = rse['name']

        if rseName not in self.rseConsStats:
            msg = "RSE: %s Missing in stats records at Rucio Consistency Monitor. " % rseName
            msg += "Skipping it in the current run."
            self.logger.warning(msg)
            raise MSUnmergedPlineExit(msg)

        isConsDone = self.rseConsStats[rseName]['status'] == 'done'
        isConsNewer = self.rseConsStats[rseName]['end_time'] > self.rseTimestamps[rseName]['prevStartTime']
        isRootFailed = self.rseConsStats[rseName]['root_failed']

        if not isConsNewer:
            msg = "RSE: %s With old consistency record in Rucio Consistency Monitor. " % rseName
            msg += "Skipping it in the current run."
            self.logger.info(msg)
            raise MSUnmergedPlineExit(msg)
        if not isConsDone:
            msg = "RSE: %s In non-final state in Rucio Consistency Monitor. " % rseName
            msg += "Skipping it in the current run."
            self.logger.warning(msg)
            raise MSUnmergedPlineExit(msg)
        if isRootFailed:
            msg = "RSE: %s With failed root in Rucio Consistency Monitor. " % rseName
            msg += "Skipping it in the current run."
            self.logger.warning(msg)
            raise MSUnmergedPlineExit(msg)

        return rse

    # @profile
    def getUnmergedFiles(self, rse):
        """
        Fetches all the records of unmerged files per RSE from the Rucio Consistency Monitor,
        cuts every path down to a fixed depth, and stores the result in the rse object.

        Path example:
        /store/unmerged/Run2016B/JetHT/MINIAOD/ver2_HIPM_UL2016_MiniAODv2-v2/140000/388E3DEF-9F15-D04C-B582-7DD036D9DD33.root

        Where:
        /store/unmerged/                       - root unmerged area
        /Run2016B                              - acquisition era
        /JetHT                                 - primary dataset
        /MINIAOD                               - data tier
        /ver2_HIPM_UL2016_MiniAODv2-v2         - processing string + processing version
        /140000/388E3DEF-...-7DD036D9DD33.root - to be cut off

        :param rse: The RSE to work on
        :return:    rse
        """
        rse['files']['allUnmerged'] = self.rucioConMon.getRSEUnmerged(rse['name'])
        for filePath in rse['files']['allUnmerged']:
            rse['counters']['totalNumFiles'] += 1
            # Check if what we start with is under /store/unmerged/*
            if self.regStoreUnmergedLfn.match(filePath):
                # Cut the path to the deepest level known to WMStats protected LFNs
                dirPath = self._cutPath(filePath)
                # Check if what is left is still under /store/unmerged/*
                if self.regStoreUnmergedLfn.match(dirPath):
                    # Add it to the set of allUnmerged
                    rse['dirs']['allUnmerged'].add(dirPath)
        return rse

    def _cutPath(self, filePath):
        """
        Cuts a file path to the deepest level known to WMStats protected LFNs
        :param filePath:   The full (absolute) file path together with the file name
        :return finalPath: The final path, cut to the correct level
        """
        # pylint: disable=E1120
        # This is a known issue when passing an unpacked list to a method expecting
        # at least one variable. In this case the signature of the method breaking the
        # rule is:
        # os.path.join(*newPath) != os.path.join(a, *p)

        # Split the initial filePath into chunks and fill it into a dictionary
        # containing only directory names and the root of the path e.g.
        # ['/', 'store', 'unmerged', 'RunIISummer20UL17SIM', ...]
        newPath = []
        root = filePath
        while True:
            root, tail = os.path.split(root)
            if tail:
                newPath.append(tail)
            else:
                newPath.append(root)
                break
        newPath.reverse()
        # Cut/slice the path to the level/element required.
        newPath = newPath[:7]
        # Build the path out of all that is found up to the deepest level in the LFN tree
        finalPath = os.path.join(*newPath)
        return finalPath

    # @profile
    def filterUnmergedFiles(self, rse):
        """
        This method applies set difference/intersection operations to the set of unmerged
        directories per RSE in order to exclude the protected LFNs.
        :param rse: The RSE to work on
        :return:    rse
        """
        rse['dirs']['toDelete'] = rse['dirs']['allUnmerged'] - self.protectedLFNs
        rse['dirs']['protected'] = rse['dirs']['allUnmerged'] & self.protectedLFNs

        # The following check may seem redundant, but better stay safe than sorry
        if not (rse['dirs']['toDelete'] | rse['dirs']['protected']) == rse['dirs']['allUnmerged']:
            rse['counters']['toDelete'] = -1
            msg = "Incorrect set check while trying to estimate the final set for deletion."
            raise MSUnmergedPlineExit(msg)

        # Get rid of 'allUnmerged' directories
        rse['dirs']['allUnmerged'].clear()

        # NOTE: Here we may want to filter out all protected files from allUnmerged and leave just those
        #       eligible for deletion. This will minimize the iteration time of the filters
        #       from toDelete later on.
        # while rse['files']['allUnmerged'

        # Now create the filters for rse['files']['toDelete'] - those should be pure generators
        # A simple generator:
        def genFunc(pattern, iterable):
            for i in iterable:
                if i.startswith(pattern):
                    yield i

        # NOTE: If 'dirFilterIncl' is non-empty then the cleaning process will
        #       be restricted to that part of the tree and will ignore anything
        #       under /store/unmerged/ which does not match the inclusion filter
        # NOTE: 'dirFilterExcl' is always applied.
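        # Editor's illustration with hypothetical values: given
        #   dirFilterIncl = {'/store/unmerged/Run2016B'} and
        #   dirFilterExcl = {'/store/unmerged/Run2016B/JetHT'},
        # only directories under /store/unmerged/Run2016B are kept, and the JetHT
        # subtree is dropped even though it also matches the inclusion filter,
        # because the exclusion check runs first in the loop below.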

        # Merge the additional filters into a final set to be applied:
        dirFilterIncl = set(self.msConfig['dirFilterIncl'])
        dirFilterExcl = set(self.msConfig['dirFilterExcl'])

        # Update directory/files with no service filters
        if not dirFilterIncl and not dirFilterExcl:
            for dirName in rse['dirs']['toDelete']:
                rse['files']['toDelete'][dirName] = genFunc(dirName, rse['files']['allUnmerged'])
            rse['counters']['dirsToDeleteAll'] = len(rse['files']['toDelete'])
            return rse

        # If we are here, then there are service filters...
        for dirName in rse['dirs']['toDelete']:
            # apply exclusion filter
            dirFilterExclMatch = []
            for pathExcl in dirFilterExcl:
                dirFilterExclMatch.append(dirName.startswith(pathExcl))
            if any(dirFilterExclMatch):
                # then it matched one of the exclusion paths
                continue
            if not dirFilterIncl:
                # there is no inclusion filter, simply add this directory/files
                rse['files']['toDelete'][dirName] = genFunc(dirName, rse['files']['allUnmerged'])
                continue

            # apply inclusion filter
            for pathIncl in dirFilterIncl:
                if dirName.startswith(pathIncl):
                    rse['files']['toDelete'][dirName] = genFunc(dirName, rse['files']['allUnmerged'])
                    break

        # Update the counters:
        rse['counters']['dirsToDeleteAll'] = len(rse['files']['toDelete'])
        return rse

    def getPfn(self, rse):
        """
        A method for fetching the common Pfn prefix (protocol + hostname + global path)
        for the RSE. It uses Rucio client method lfns2pfns for one of the LFNs
        already recorded in the RSE in order to resolve the lfn to pfn mapping
        and then tries to parse the resultant pfn and cut off the lfn part.
        :param rse: The RSE to be checked
        :return:    rse
        """
        # NOTE:  pfnPrefix here is considered the full part of the pfn up to the
        #        beginning of the lfn part rather than just the protocol prefix
        if rse['files']['allUnmerged']:
            lfn = next(iter(rse['files']['allUnmerged']))
            pfnDict = self.rucio.getPFN(rse['name'], lfn, operation='delete')
            pfnFull = pfnDict[lfn]
            if self.regStoreUnmergedPfn.match(pfnFull):
                pfnPrefix = pfnFull.split('/store/unmerged/')[0]
                rse['pfnPrefix'] = pfnPrefix
            else:
                msg = "Could not establish the correct pfn Prefix for RSE: %s. " % rse['name']
                msg += "Will fall back to calling Rucio on a directory basis for lfn to pfn resolution."
                self.logger.warning(msg)
        return rse

    # @profile
    def purgeRseObj(self, rse, dumpRSE=False):
        """
        Cleaning all the records in an RSE object. The final method to be used
        before an RSE exits a pipeline.
        :param rse: The RSE to be checked
        :return:    rse
        """
        msg = "\n----------------------------------------------------------"
        msg += "\nMSUnmergedRSE: \n%s"
        msg += "\n----------------------------------------------------------"
        if dumpRSE:
            self.logger.debug(msg, pformat(rse))
        else:
            self.logger.debug(msg, twFormat(rse, maxLength=6))
        rse.clear()
        return rse

    def updateRSETimestamps(self, rse, start=True, end=True):
        """
        Update/Upload all timestamps for the rse object into the MSUnmerged
        service counters
        :param rse:   The RSE to work on
        :return:      rse
        """
        rseName = rse['name']
        currTime = time()

        if rseName not in self.rseTimestamps:
            self.rseTimestamps[rseName] = {'prevStartTime': 0.0,
                                           'startTime': 0.0,
                                           'prevEndtime': 0.0,
                                           'endTime': 0.0}
        if start:
            self.rseTimestamps[rseName]['prevStartTime'] = self.rseTimestamps[rseName]['startTime']
            self.rseTimestamps[rseName]['startTime'] = currTime
        if end:
            self.rseTimestamps[rseName]['prevEndtime'] = self.rseTimestamps[rseName]['endTime']
            self.rseTimestamps[rseName]['endTime'] = currTime
        return rse

    def updateRSECounters(self, rse, pName):
        """
        Update/Upload all counters from the rse object into the MSUnmerged
        service counters
        :param rse:   The RSE to work on
        :param pName: The pipeline name whose counters to be updated
        :return:      rse
        """
        rseName = rse['name']
        self.resetCounters(rseName=rseName)
        self.rseCounters[rseName]['totalNumFiles'] = rse['counters']['totalNumFiles']
        self.rseCounters[rseName]['deletedSuccess'] = rse['counters']['deletedSuccess']
        self.rseCounters[rseName]['deletedFail'] = rse['counters']['deletedFail']

        self.plineCounters[pName]['totalNumFiles'] += rse['counters']['totalNumFiles']
        self.plineCounters[pName]['deletedSuccess'] += rse['counters']['deletedSuccess']
        self.plineCounters[pName]['deletedFail'] += rse['counters']['deletedFail']
        self.plineCounters[pName]['rsesProcessed'] += 1
        if rse['isClean']:
            self.plineCounters[pName]['rsesCleaned'] += 1

        return rse

    def resetCounters(self, rseName=None, plineName=None):
        """
        A simple function for zeroing the service counters.
        :param rseName:   RSE Name whose counters to be zeroed
        :param plineName: The Pline Name whose counters to be zeroed
        """

        # Resetting Just the RSE Counters
        if rseName is not None:
            if rseName not in self.rseCounters:
                self.rseCounters[rseName] = {}
            self.rseCounters[rseName]['totalNumFiles'] = 0
            self.rseCounters[rseName]['deletedSuccess'] = 0
            self.rseCounters[rseName]['deletedFail'] = 0
            return

        # Resetting Just the pline counters
        if plineName is not None:
            if plineName not in self.plineCounters:
                self.plineCounters[plineName] = {}
            self.plineCounters[plineName]['totalNumFiles'] = 0
            self.plineCounters[plineName]['deletedSuccess'] = 0
            self.plineCounters[plineName]['deletedFail'] = 0
            self.plineCounters[plineName]['totalNumRses'] = 0
            self.plineCounters[plineName]['rsesProcessed'] = 0
            self.plineCounters[plineName]['rsesCleaned'] = 0
            return

        # Resetting all counters
        for rseName in self.rseCounters:
            self.rseCounters[rseName]['totalNumFiles'] = 0
            self.rseCounters[rseName]['deletedSuccess'] = 0
            self.rseCounters[rseName]['deletedFail'] = 0

        for plineName in self.plineCounters:
            self.plineCounters[plineName]['totalNumFiles'] = 0
            self.plineCounters[plineName]['deletedSuccess'] = 0
            self.plineCounters[plineName]['deletedFail'] = 0
            self.plineCounters[plineName]['totalNumRses'] = 0
            self.plineCounters[plineName]['rsesProcessed'] = 0
            self.plineCounters[plineName]['rsesCleaned'] = 0
        return

    # @profile
    def getRSEList(self):
        """
        Queries Rucio for the proper RSE list to iterate through.
        :return: List of RSE names.
        """
        try:
            rseList = self.rucio.evaluateRSEExpression(self.msConfig['rseExpr'])
        except Exception as ex:
            msg = "Unknown exception while trying to fetch the initial list of RSEs to work on. Err: %s"
            self.logger.exception(msg, str(ex))
            rseList = []
        return rseList
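
# Editor's sketch: a minimal, hypothetical way to drive the MSUnmerged class defined
# above in dry-run mode. The endpoint URLs, the RSE expression and the
# 'limitRSEsPerInstance' value are assumptions based only on what the listing
# references; MSCore (the base class) may require additional configuration keys
# not shown here.
if __name__ == "__main__":
    msConfig = {
        "rucioConMon": "https://rucio-conmon.example.cern.ch",  # placeholder endpoint
        "wmstatsUrl": "https://wmstats.example.cern.ch",        # placeholder endpoint
        "limitRSEsPerInstance": 10,   # referenced in execute() above
        "enableRealMode": False,      # keep actual deletions disabled
        "emulateGfal2": True,         # avoid the hard gfal2 dependency
        "rseExpr": "rse_type=DISK",   # hypothetical Rucio RSE expression
    }
    summary = MSUnmerged(msConfig).execute()
    print(summary)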