def main():
    """Report the staging status of one file at one storage element.

    Expects two positional arguments, the LFN and the SE.  The matching cache
    replica is looked up through the StorageManagerClient and a summary of the
    replica, of the tasks (jobs) that requested it and of its stage requests is
    sent to ``gLogger.notice``.

    The script exits with code 2 when the cache replica query fails and with
    code 0 otherwise.
    """
    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 2:
        # NOTE(review): the code below assumes showHelp() terminates the script - confirm.
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    lfn = args[0]
    se = args[1]

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    def field(label, value, suffix=""):
        # One "label: value" output line; str() keeps non-string values (e.g. None) printable.
        return "\n%s: %s%s" % (label.ljust(8), str(value).ljust(100), suffix)

    client = StorageManagerClient()
    res = client.getCacheReplicas({'LFN': lfn, 'SE': se})
    if not res['OK']:
        gLogger.error(res['Message'])
        # A failed result carries no 'Value'; stop here instead of raising KeyError.
        DIRACExit(2)

    cacheReplicaInfo = res['Value']
    if cacheReplicaInfo:
        # Only the first matching replica is reported.
        replicaID = list(cacheReplicaInfo)[0]
        replica = cacheReplicaInfo[replicaID]
        outStr = "\n--------------------"
        for key in ('LFN', 'SE', 'PFN', 'Status', 'LastUpdate', 'Reason'):
            outStr += field(key, replica[key])

        resTasks = client.getTasks({'ReplicaID': replicaID})
        if resTasks['OK']:
            outStr += '\nJob IDs requesting this file to be staged:'.ljust(8)
            for task in resTasks['Value'].values():
                outStr += ' %s ' % (task['SourceTaskID'])

        resStageRequests = client.getStageRequests({'ReplicaID': replicaID})
        if not resStageRequests['OK']:
            # A failed result has no 'Records' key, so only the error is reported.
            gLogger.error(resStageRequests['Message'])
        elif resStageRequests['Records']:
            outStr += "\n------SRM staging request info--------------"
            for info in resStageRequests['Value'].values():
                outStr += field('SRM RequestID', info['RequestID'])
                outStr += field('SRM StageStatus', info['StageStatus'])
                outStr += field('SRM StageRequestSubmitTime', info['StageRequestSubmitTime'])
                outStr += field('SRM StageRequestCompletedTime', info['StageRequestCompletedTime'])
                outStr += field('SRM PinExpiryTime', info['PinExpiryTime'])
                outStr += field('SRM PinLength', info['PinLength'], " sec")
        else:
            outStr += '\nThere are no staging requests submitted to the site yet.'.ljust(8)
    else:
        outStr = "\nThere is no such file requested for staging. Check for typo's!"

    gLogger.notice(outStr)

    DIRACExit(0)
class RequestPreparationAgent(AgentModule):
    """Agent managing the New->Waiting transition of the cache replicas.

    On every cycle the replicas in 'New' status are fetched from the stager
    service and checked against the file catalog: the file must exist, have a
    non-zero size and have a replica registered at the requested storage
    element.  Replicas failing a check are reported as failed to the stager
    service; for the others the replica metadata is updated.
    """

    def initialize(self):
        """Create the clients used by the agent and select the shifter proxy.

        :return: S_OK()
        """
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):
        """Run one preparation cycle over the New replicas.

        This is the first logical task to be executed and manages the
        New->Waiting transition of the Replicas.

        :return: the failing result when obtaining the replicas or a catalog
                 query fails, S_OK() otherwise
        """
        res = self.__getNewReplicas()
        if not res['OK']:
            gLogger.fatal("RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res['Value']['Replicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info("RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation."
                     % len(replicaIDs))

        terminalReplicaIDs = {}

        def markTerminal(terminalLfns):
            # Drop the given LFNs from the working set and record the failure reason per replica ID.
            for lfn, reason in terminalLfns.items():
                for replicaID in replicas[lfn].values():
                    terminalReplicaIDs[replicaID] = reason
                replicas.pop(lfn)

        # Check if the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas)
        if not res['OK']:
            return res
        exist = res['Value']['Exist']
        terminal = res['Value']['Missing']
        failed = res['Value']['Failed']
        if not exist:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file')
            return S_OK()
        markTerminal(terminal)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info("RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog."
                         % len(terminal))

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroSize']
        fileSizes = res['Value']['FileSizes']
        if not fileSizes:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed determine sizes of any files')
            return S_OK()
        markTerminal(terminal)
        gLogger.info("RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog."
                     % len(fileSizes))
        if terminal:
            gLogger.info("RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                         % len(terminal))

        # Obtain the replicas from the FileCatalog
        # (a real list is passed, not a dict view, so the argument can be serialised on Python 3)
        res = self.__getFileReplicas(list(fileSizes))
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroReplicas']
        fileReplicas = res['Value']['Replicas']
        if not fileReplicas:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed determine replicas for any files')
            return S_OK()
        markTerminal(terminal)
        gLogger.info("RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
                     % len(fileReplicas))
        if terminal:
            gLogger.info("RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                         % len(terminal))

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas.get(lfn)

            # This should not happen in principle, but it was seen
            # after a corrupted staging request has entered the DB
            if not lfnReplicas:
                gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs))
                continue

            # Iterate over a snapshot: entries are popped from the same dictionary inside the
            # loop, which raises RuntimeError on Python 3 when iterating the live view.
            for requestedSE, replicaID in list(requestedSEs.items()):
                if requestedSE not in lfnReplicas:
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    requestedSEs.pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info("RequestPreparation.prepareNewReplicas: %s replicas are terminally failed."
                         % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error("RequestPreparation.prepareNewReplicas: Failed to update replica failures.",
                              res['Message'])
        if replicaMetadata:
            gLogger.info("RequestPreparation.prepareNewReplicas: %s replica metadata to be updated."
                         % len(replicaMetadata))
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res['OK']:
                gLogger.error("RequestPreparation.prepareNewReplicas: Failed to update replica metadata.",
                              res['Message'])
        return S_OK()

    def __getNewReplicas(self):
        """Obtain the New replicas and, for each LFN, the requested storage elements.

        :return: S_OK() without value when there is nothing to process, otherwise
                 S_OK with 'Replicas' ({lfn: {se: replicaID}}) and
                 'ReplicaIDs' ({replicaID: (lfn, se)})
        """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({'Status': 'New'})
        if not res['OK']:
            gLogger.error("RequestPreparation.__getNewReplicas: Failed to get replicas with New status.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        gLogger.debug("RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process."
                      % len(res['Value']))
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs})

    def __getExistingFiles(self, lfns):
        """Check which of the given LFNs exist in the FileCatalog.

        :param lfns: iterable of LFNs (duplicates are removed before the query)
        :return: S_OK with 'Exist' (list of LFNs), 'Missing' ({lfn: reason}) and
                 'Failed' (as returned by the catalog), or the failing result
        """
        res = self.fileCatalog.exists(list(set(lfns)))
        if not res['OK']:
            gLogger.error("RequestPreparation.__getExistingFiles: Failed to determine whether files exist.",
                          res['Message'])
            return res
        failed = res['Value']['Failed']
        success = res['Value']['Successful']
        exist = [lfn for lfn, exists in success.items() if exists]
        missing = list(set(success) - set(exist))
        if missing:
            reason = 'LFN not registered in the FC'
            gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, '\n'.join([''] + missing))
            self.__reportProblematicFiles(missing, 'LFN-LFC-DoesntExist')
            missing = dict.fromkeys(missing, reason)
        else:
            missing = {}
        return S_OK({'Exist': exist, 'Missing': missing, 'Failed': failed})

    def __getFileSize(self, lfns):
        """Obtain the sizes of the given LFNs from the FileCatalog.

        :param lfns: LFNs to query
        :return: S_OK with 'FileSizes' ({lfn: size}), 'ZeroSize' ({lfn: reason})
                 and 'Failed' (as returned by the catalog), or the failing result
        """
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res['OK']:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, size in res['Value']['Successful'].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            self.__reportProblematicFiles(list(zeroSize), 'LFN-LFC-ZeroSize')
        return S_OK({'FileSizes': fileSizes, 'ZeroSize': zeroSize, 'Failed': failed})

    def __getFileReplicas(self, lfns):
        """Obtain the active replicas of the given LFNs.

        :param lfns: LFNs to query
        :return: S_OK with 'Replicas' ({lfn: replicas}), 'ZeroReplicas'
                 ({lfn: reason}) and 'Failed' (as returned by the DataManager),
                 or the failing result
        """
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res['OK']:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, lfnReplicas in res['Value']['Successful'].items():
            if len(lfnReplicas) == 0:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), 'LFN-LFC-NoReplicas')
        return S_OK({'Replicas': replicas, 'ZeroReplicas': noReplicas, 'Failed': failed})

    def __reportProblematicFiles(self, lfns, reason):
        """Report problematic files to the data integrity service (currently disabled).

        The unconditional return below short-circuits the method, so nothing is
        reported; the remaining code is kept for when reporting is re-enabled.

        :param lfns: LFNs to report
        :param reason: problem tag passed to setFileProblematic
        :return: S_OK()
        """
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent='RequestPreparationAgent')
        if not res['OK']:
            gLogger.error("RequestPreparation.__reportProblematicFiles: Failed to report missing files.",
                          res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info("RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                         % len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info("RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                         % len(res['Value']['Failed']))
        return res
class StageMonitorAgent(AgentModule):
    """Agent managing the StageSubmitted->Staged transition of the cache replicas.

    On every cycle the replicas in 'StageSubmitted' status are grouped by storage
    element, the storage element is asked for the metadata of the files and the
    outcome is reported to the stager service.  One accounting record is
    registered per storage element and committed at the end of the cycle.
    """

    def initialize(self):
        """Create the stager client and select the shifter proxy.

        :return: S_OK()
        """
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):
        """Store the current proxy information and monitor the stage requests.

        :return: the failing getProxyInfo result, or the result of monitorStageRequests
        """
        res = getProxyInfo(disableVOMS=True)
        if not res['OK']:
            return res
        # Read later by __newAccountingDict to fill the 'User' field.
        self.proxyInfoDict = res['Value']

        return self.monitorStageRequests()

    def monitorStageRequests(self):
        """Monitor the StageSubmitted replicas, one storage element at a time.

        This is the third logical task and manages the StageSubmitted->Staged
        transition of the Replicas.

        :return: the result of the replica query when it fails or is empty, S_OK() otherwise
        """
        res = self.__getStageSubmittedReplicas()
        if not res['OK']:
            gLogger.fatal("StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return res
        seReplicas = res['Value']['SEReplicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info("StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
                     % len(replicaIDs))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__monitorStorageElementStageRequests(storageElement, seReplicaIDs, replicaIDs)

        gDataStoreClient.commit()

        return S_OK()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Check the stage requests of one storage element and update the replica states.

        :param storageElement: name of the storage element
        :param seReplicaIDs: IDs of the replicas requested at this storage element
        :param replicaIDs: {replicaID: info}; 'LFN' and (optionally) 'RequestID' are read
        :return: None
        """
        terminalReplicaIDs = {}
        oldRequests = []
        stagedReplicas = []

        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {}
        lfnReqIDs = {}
        for replicaID in seReplicaIDs:
            replicaInfo = replicaIDs[replicaID]
            lfn = replicaInfo['LFN']
            lfnRepIDs[lfn] = replicaID
            requestID = replicaInfo.get('RequestID', None)
            if requestID:
                lfnReqIDs[lfn] = requestID

        gLogger.info("StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
                     % (len(lfnRepIDs), storageElement))
        oAccounting = DataOperation()
        oAccounting.setStartTime()

        res = StorageElement(storageElement).getFileMetadata(lfnReqIDs)
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.",
                res['Message'])
            return
        prestageStatus = res['Value']

        accountingDict = self.__newAccountingDict(storageElement)

        for lfn, reason in prestageStatus['Failed'].items():
            accountingDict['TransferTotal'] += 1
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement", lfn)
                terminalReplicaIDs[lfnRepIDs[lfn]] = 'LFN did not exist in the StorageElement'
        for lfn, staged in prestageStatus['Successful'].items():
            # Entries without 'Cached' information are neither counted nor retried.
            if not staged or 'Cached' not in staged:
                continue
            if staged['Cached']:
                accountingDict['TransferTotal'] += 1
                accountingDict['TransferOK'] += 1
                accountingDict['TransferSize'] += staged['Size']
                stagedReplicas.append(lfnRepIDs[lfn])
            else:
                # only ReplicaIDs
                oldRequests.append(lfnRepIDs[lfn])

        oAccounting.setValuesFromDict(accountingDict)
        oAccounting.setEndTime()
        gDataStoreClient.addRegister(oAccounting)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info("StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                         % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error("StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                              res['Message'])
        if stagedReplicas:
            gLogger.info("StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                         % len(stagedReplicas))
            res = self.stagerClient.setStageComplete(stagedReplicas)
            if not res['OK']:
                gLogger.error("StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                              res['Message'])
            res = self.stagerClient.updateReplicaStatus(stagedReplicas, 'Staged')
            if not res['OK']:
                gLogger.error("StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                              res['Message'])
        if oldRequests:
            gLogger.info("StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                         % len(oldRequests))
            res = self.__wakeupOldRequests(oldRequests)
            if not res['OK']:
                gLogger.error("StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                              res['Message'])
        return

    def __newAccountingDict(self, storageElement):
        """Generate a new accounting dictionary for a 'Stage' operation.

        :param storageElement: storage element used as both source and destination
        :return: dict with the counters set to zero
        """
        return {
            'OperationType': 'Stage',
            'User': self.proxyInfoDict['username'],
            'Protocol': 'Stager',
            'RegistrationTime': 0.0,
            'RegistrationOK': 0,
            'RegistrationTotal': 0,
            'FinalStatus': 'Successful',
            'Source': storageElement,
            'Destination': storageElement,
            'ExecutionSite': siteName(),
            'TransferTotal': 0,
            'TransferOK': 0,
            'TransferSize': 0,
            'TransferTime': self.am_getPollingTime(),
        }

    def __getStageSubmittedReplicas(self):
        """Obtain the StageSubmitted replicas and attach the RequestID of their stage requests.

        :return: S_OK() without value when there is nothing to process, otherwise
                 S_OK with 'SEReplicas' ({se: [replicaID, ...]}) and 'ReplicaIDs'
                 ({replicaID: info}); the failing result or S_ERROR when the
                 request IDs cannot be obtained
        """
        res = self.stagerClient.getCacheReplicas({'Status': 'StageSubmitted'})
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug("StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
                      % len(res['Value']))

        seReplicas = {}
        replicaIDs = res['Value']
        for replicaID, info in replicaIDs.items():
            # setdefault replaces dict.has_key, which does not exist on Python 3
            seReplicas.setdefault(info['SE'], []).append(replicaID)

        # RequestID was missing from replicaIDs dictionary BUGGY?
        # A real list (not a dict view) is sent to the service and shown in the error message.
        allReplicaIDs = list(replicaIDs)
        res = self.stagerClient.getStageRequests({'ReplicaID': allReplicaIDs})
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR('Could not obtain request IDs for replicas %s from StageRequests table' % (allReplicaIDs))

        # NOTE(review): assumes every key returned by getStageRequests is present in replicaIDs - confirm.
        for replicaID, info in res['Value'].items():
            replicaIDs[replicaID]['RequestID'] = info['RequestID']

        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        """Ask the stager service to wake up old requests so they are retried.

        :param oldRequests: replica IDs whose files were reported as not cached
        :return: the failing service result, S_OK() otherwise
        """
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryInterval = self.am_getOption('RetryIntervalHour', 2)
        res = self.stagerClient.wakeupOldRequests(oldRequests, retryInterval)
        if not res['OK']:
            gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", res['Message'])
            return res
        return S_OK()
' LFN: LFN of the staging file', ' SE: Storage Element for the staging file' ])) Script.parseCommandLine(ignoreErrors=True) args = Script.getPositionalArgs() if len(args) < 2: Script.showHelp() from DIRAC import exit as DIRACExit, gLogger lfn = args[0] se = args[1] from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient client = StorageManagerClient() res = client.getCacheReplicas({'LFN': lfn, 'SE': se}) if not res['OK']: gLogger.error(res['Message']) cacheReplicaInfo = res['Value'] if cacheReplicaInfo: replicaID = cacheReplicaInfo.keys()[0] outStr = "\n--------------------" outStr += "\n%s: %s" % ('LFN'.ljust(8), cacheReplicaInfo[replicaID]['LFN'].ljust(100)) outStr += "\n%s: %s" % ('SE'.ljust(8), cacheReplicaInfo[replicaID]['SE'].ljust(100)) outStr += "\n%s: %s" % ('PFN'.ljust(8), cacheReplicaInfo[replicaID]['PFN'].ljust(100)) outStr += "\n%s: %s" % ('Status'.ljust(8), cacheReplicaInfo[replicaID]['Status'].ljust(100)) outStr += "\n%s: %s" % ('LastUpdate'.ljust(
def run():
    """Print a table of the cache replicas selected by the command line switches.

    The module-level ``switchDict`` is read for the optional 'status', 'se',
    'limit' and 'showJobs' switches.  For each replica returned by the
    StorageManagerClient one row is built; with 'showJobs' the IDs of the
    requesting tasks are added, followed by the pin information of the stage
    requests.  The table is sent to ``gLogger.notice``.

    When the replica query fails the error is logged and nothing else is printed.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient
    client = StorageManagerClient()
    queryDict = {}

    if 'status' in switchDict:
        queryDict['Status'] = str(switchDict['status'])

    if 'se' in switchDict:
        queryDict['SE'] = str(switchDict['se'])

    # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
    # must FIX that in StorageManagementDB.py!
    # ugly fix:
    newer = '1903-08-02 06:24:38'  # select newer than
    if 'limit' in switchDict:
        gLogger.notice("Query limited to %s entries" % switchDict['limit'])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict['limit']))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res['OK']:
        gLogger.error(res['Message'])
        # A failed result has neither 'Records' nor 'Value'; stop instead of raising KeyError.
        return

    if not res['Records']:
        gLogger.notice("No entries")
        return

    showJobs = 'showJobs' in switchDict
    replicas = res['Value']

    # Header line
    outStr = "\n"
    outStr += " %s" % ("Status".ljust(15))
    outStr += " %s" % ("LastUpdate".ljust(20))
    outStr += " %s" % ("LFN".ljust(80))
    outStr += " %s" % ("SE".ljust(10))
    outStr += " %s" % ("Reason".ljust(10))
    if showJobs:
        outStr += " %s" % ("Jobs".ljust(10))
    outStr += " %s" % ("PinExpiryTime".ljust(15))
    outStr += " %s" % ("PinLength(sec)".ljust(15))
    outStr += "\n"

    # .items() instead of the Python 2 only .iteritems()
    for crid, info in replicas.items():
        outStr += " %s" % (info['Status'].ljust(15))
        outStr += " %s" % (str(info['LastUpdate']).ljust(20))
        outStr += " %s" % (info['LFN'].ljust(30))
        outStr += " %s" % (info['SE'].ljust(15))
        outStr += " %s" % (str(info['Reason']).ljust(10))

        # Task info
        if showJobs:
            resTasks = client.getTasks({'ReplicaID': crid})
            if resTasks['OK']:
                tasks = resTasks['Value']
                if tasks:
                    jobs = [task['SourceTaskID'] for task in tasks.values()]
                    outStr += ' %s ' % (str(jobs).ljust(10))
            else:
                outStr += ' %s ' % (" --- ".ljust(10))

        # Stage request info
        # what if there's no request to the site yet?
        resStageRequests = client.getStageRequests({'ReplicaID': crid})
        if not resStageRequests['OK']:
            # No 'Records' key on a failed result: report and leave the pin columns empty.
            gLogger.error(resStageRequests['Message'])
        elif resStageRequests['Records']:
            # A distinct loop name avoids clobbering the replica 'info' of the outer loop.
            for stageInfo in resStageRequests['Value'].values():
                outStr += " %s" % (str(stageInfo['PinExpiryTime']).ljust(20))
                outStr += " %s" % (str(stageInfo['PinLength']).ljust(10))
        outStr += "\n"

    gLogger.notice(outStr)
class RequestPreparationAgent(AgentModule):
    """Agent managing the New->Waiting transition of the cache replicas.

    Each cycle takes the replicas in "New" status from the stager service and
    validates them against the file catalog (existence, non-zero size,
    replica registered at the requested storage element).  Replicas that fail
    a check are reported as failed; the remaining ones get their metadata
    updated in the stager service.
    """

    def initialize(self):
        """Instantiate the clients used by the agent and select the shifter proxy.

        :return: S_OK()
        """
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")

        return S_OK()

    def execute(self):
        """Run one preparation cycle.

        This is the first logical task to be executed and manages the
        New->Waiting transition of the Replicas.

        :return: the failing result when obtaining the replicas or a catalog
                 query fails, S_OK() otherwise
        """
        res = self.__getNewReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res["Value"]["Replicas"]
        replicaIDs = res["Value"]["ReplicaIDs"]
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)
        )

        # Check if the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas)
        if not res["OK"]:
            return res
        exist = res["Value"]["Exist"]
        terminal = res["Value"]["Missing"]
        failed = res["Value"]["Failed"]
        if not exist:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file")
            return S_OK()
        terminalReplicaIDs = {}
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal)
            )

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroSize"]
        fileSizes = res["Value"]["FileSizes"]
        if not fileSizes:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine sizes of any files")
            return S_OK()
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal)
            )

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroReplicas"]
        fileReplicas = res["Value"]["Replicas"]
        if not fileReplicas:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine replicas for any files")
            return S_OK()
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal)
            )

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas.get(lfn)

            # This should not happen in principle, but it was seen
            # after a corrupted staging request has entered the DB
            if not lfnReplicas:
                gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs))
                continue

            # The loop removes entries from requestedSEs, so it must walk a snapshot:
            # resizing a dict while iterating its live view raises RuntimeError.
            for requestedSE, replicaID in list(requestedSEs.items()):
                if requestedSE not in lfnReplicas:
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    requestedSEs.pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)
            )
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res["Message"]
                )
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata)
            )
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res["Message"]
                )
        return S_OK()

    def __getNewReplicas(self):
        """Obtain the New replicas and, for each LFN, the requested storage elements.

        :return: S_OK() without value when nothing is to be processed, otherwise
                 S_OK with "Replicas" ({lfn: {se: replicaID}}) and
                 "ReplicaIDs" ({replicaID: (lfn, se)})
        """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({"Status": "New"})
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        gLogger.debug(
            "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res["Value"])
        )
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res["Value"].items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({"Replicas": replicas, "ReplicaIDs": replicaIDs})

    def __getExistingFiles(self, lfns):
        """Check which of the given LFNs exist in the FileCatalog.

        :param lfns: iterable of LFNs (duplicates are removed before the query)
        :return: S_OK with "Exist" (list of LFNs), "Missing" ({lfn: reason}) and
                 "Failed" (as returned by the catalog), or the failing result
        """
        res = self.fileCatalog.exists(list(set(lfns)))
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res["Message"]
            )
            return res
        failed = res["Value"]["Failed"]
        success = res["Value"]["Successful"]
        exist = [lfn for lfn, exists in success.items() if exists]
        missing = list(set(success) - set(exist))
        if missing:
            reason = "LFN not registered in the FC"
            gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, "\n".join([""] + missing))
            self.__reportProblematicFiles(missing, "LFN-LFC-DoesntExist")
            missing = dict.fromkeys(missing, reason)
        else:
            missing = {}
        return S_OK({"Exist": exist, "Missing": missing, "Failed": failed})

    def __getFileSize(self, lfns):
        """Obtain the sizes of the given LFNs from the FileCatalog.

        :param lfns: LFNs to query
        :return: S_OK with "FileSizes" ({lfn: size}), "ZeroSize" ({lfn: reason})
                 and "Failed" (as returned by the catalog), or the failing result
        """
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, size in res["Value"]["Successful"].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            # list(), as done for the zero-replica case, rather than a dict view
            self.__reportProblematicFiles(list(zeroSize), "LFN-LFC-ZeroSize")
        return S_OK({"FileSizes": fileSizes, "ZeroSize": zeroSize, "Failed": failed})

    def __getFileReplicas(self, lfns):
        """Obtain the active replicas of the given LFNs.

        :param lfns: LFNs to query
        :return: S_OK with "Replicas" ({lfn: replicas}), "ZeroReplicas"
                 ({lfn: reason}) and "Failed" (as returned by the DataManager),
                 or the failing result
        """
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, lfnReplicas in res["Value"]["Successful"].items():
            if len(lfnReplicas) == 0:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), "LFN-LFC-NoReplicas")
        return S_OK({"Replicas": replicas, "ZeroReplicas": noReplicas, "Failed": failed})

    def __reportProblematicFiles(self, lfns, reason):
        """Report problematic files to the data integrity service (currently disabled).

        The unconditional return below short-circuits the method, so nothing is
        reported; the remaining code is kept for when reporting is re-enabled.

        :param lfns: LFNs to report
        :param reason: problem tag passed to setFileProblematic
        :return: S_OK()
        """
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent="RequestPreparationAgent")
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res["Message"]
            )
            return res
        if res["Value"]["Successful"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res["Value"]["Successful"])
            )
        if res["Value"]["Failed"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res["Value"]["Failed"])
            )
        return res
' %s LFN SE ...' % Script.scriptName, 'Arguments:', ' LFN: LFN of the staging file \n', ' SE: Storage Element for the staging file \n' ] ) ) Script.parseCommandLine( ignoreErrors = True ) args = Script.getPositionalArgs() if len( args ) < 2: Script.showHelp() lfn = args[0] se = args[1] from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient client = StorageManagerClient() res = client.getCacheReplicas( {'LFN':lfn,'SE':se} ) if not res['OK']: print res['Message'] cacheReplicaInfo = res['Value'] if cacheReplicaInfo: replicaID = cacheReplicaInfo.keys()[0] outStr = "\n--------------------" outStr = "%s\n%s: %s" % ( outStr, 'LFN'.ljust( 8 ), cacheReplicaInfo[replicaID]['LFN'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'SE'.ljust( 8 ), cacheReplicaInfo[replicaID]['SE'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'PFN'.ljust( 8 ), cacheReplicaInfo[replicaID]['PFN'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'Status'.ljust( 8 ), cacheReplicaInfo[replicaID]['Status'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'LastUpdate'.ljust( 8 ), str(cacheReplicaInfo[replicaID]['LastUpdate']).ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'Reason'.ljust( 8 ), str( cacheReplicaInfo[replicaID]['Reason']).ljust( 100 ) ) resTasks = client.getTasks({'ReplicaID':replicaID})
" SE: Storage Element for the staging file \n", ] ) ) Script.parseCommandLine(ignoreErrors=True) args = Script.getPositionalArgs() if len(args) < 2: Script.showHelp() lfn = args[0] se = args[1] from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient client = StorageManagerClient() res = client.getCacheReplicas({"LFN": lfn, "SE": se}) if not res["OK"]: print res["Message"] cacheReplicaInfo = res["Value"] if cacheReplicaInfo: replicaID = cacheReplicaInfo.keys()[0] outStr = "\n--------------------" outStr = "%s\n%s: %s" % (outStr, "LFN".ljust(8), cacheReplicaInfo[replicaID]["LFN"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "SE".ljust(8), cacheReplicaInfo[replicaID]["SE"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "PFN".ljust(8), cacheReplicaInfo[replicaID]["PFN"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "Status".ljust(8), cacheReplicaInfo[replicaID]["Status"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "LastUpdate".ljust(8), str(cacheReplicaInfo[replicaID]["LastUpdate"]).ljust(100)) outStr = "%s\n%s: %s" % (outStr, "Reason".ljust(8), str(cacheReplicaInfo[replicaID]["Reason"]).ljust(100)) resTasks = client.getTasks({"ReplicaID": replicaID})
def run():
    """Print a table of the cached replicas selected by the command-line switches.

    Reads the module-level ``switchDict``: ``status`` and ``se`` become query
    filters, ``limit`` caps the number of returned entries and ``showJobs``
    adds a column with the IDs of the jobs that requested each replica.

    The table is written to standard output. If the replica query fails, its
    error message is printed and the function returns without a table.
    Returns None.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    queryDict = {}

    if 'status' in switchDict:
        queryDict['Status'] = str(switchDict['status'])
    if 'se' in switchDict:
        queryDict['SE'] = str(switchDict['se'])

    # Known quirk: when no other switch is given (empty query dictionary) the
    # --limit is ignored; this must be fixed in StorageManagementDB.py.
    # Workaround: always pass a very old "newer than" timestamp.
    newer = '1903-08-02 06:24:38'  # select newer than
    if 'limit' in switchDict:
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is also valid Python 3.
        print("Query limited to %s entries" % switchDict['limit'])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict['limit']))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res['OK']:
        # Stop here: an error result is not expected to carry 'Records' or
        # 'Value', so continuing would only raise a KeyError.
        print(res['Message'])
        return

    if not res['Records']:
        print("No entries")
        return

    replicas = res['Value']
    showJobs = 'showJobs' in switchDict

    # Header row
    outStr = "\n"
    outStr += " %s" % "Status".ljust(15)
    outStr += " %s" % "LastUpdate".ljust(20)
    outStr += " %s" % "LFN".ljust(80)
    outStr += " %s" % "SE".ljust(10)
    outStr += " %s" % "Reason".ljust(10)
    if showJobs:
        outStr += " %s" % "Jobs".ljust(10)
    outStr += " %s" % "PinExpiryTime".ljust(15)
    outStr += " %s" % "PinLength(sec)".ljust(15)
    outStr += "\n"

    # One row per cached replica
    for crid in replicas.keys():
        replica = replicas[crid]
        outStr += " %s" % replica['Status'].ljust(15)
        outStr += " %s" % str(replica['LastUpdate']).ljust(20)
        outStr += " %s" % replica['LFN'].ljust(30)
        outStr += " %s" % replica['SE'].ljust(15)
        outStr += " %s" % str(replica['Reason']).ljust(10)

        # Task info
        if showJobs:
            resTasks = client.getTasks({'ReplicaID': crid})
            if resTasks['OK']:
                tasks = resTasks['Value']
                if tasks:
                    jobs = [tasks[tid]['SourceTaskID'] for tid in tasks.keys()]
                    outStr += ' %s ' % str(jobs).ljust(10)
                else:
                    outStr += ' %s ' % " --- ".ljust(10)

        # Stage request info (there may be no request to the site yet)
        resStageRequests = client.getStageRequests({'ReplicaID': crid})
        if not resStageRequests['OK']:
            # Report the failure and leave the pin columns empty for this row
            # instead of crashing on the missing 'Records' key.
            print(resStageRequests['Message'])
        elif resStageRequests['Records']:
            stageRequests = resStageRequests['Value']
            for srid in stageRequests.keys():
                outStr += " %s" % str(stageRequests[srid]['PinExpiryTime']).ljust(20)
                outStr += " %s" % str(stageRequests[srid]['PinLength']).ljust(10)
        outStr += "\n"

    print(outStr)
def run():
    """Log a table of the cached replicas selected by the command-line switches.

    Reads the module-level ``switchDict``: ``status`` and ``se`` become query
    filters, ``limit`` caps the number of returned entries and ``showJobs``
    adds a column with the IDs of the jobs that requested each replica.

    Output goes through ``gLogger``. If the replica query fails, the error is
    logged and the function returns without a table. Returns None.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    queryDict = {}

    if "status" in switchDict:
        queryDict["Status"] = str(switchDict["status"])
    if "se" in switchDict:
        queryDict["SE"] = str(switchDict["se"])

    # Known quirk: when no other switch is given (empty query dictionary) the
    # --limit is ignored; this must be fixed in StorageManagementDB.py.
    # Workaround: always pass a very old "newer than" timestamp.
    newer = "1903-08-02 06:24:38"  # select newer than
    if "limit" in switchDict:
        gLogger.notice("Query limited to %s entries" % switchDict["limit"])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict["limit"]))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res["OK"]:
        # Stop here: an error result is not expected to carry "Records" or
        # "Value", so continuing would only raise a KeyError.
        gLogger.error(res["Message"])
        return

    if not res["Records"]:
        gLogger.notice("No entries")
        return

    replicas = res["Value"]
    showJobs = "showJobs" in switchDict

    # Header row
    outStr = "\n"
    outStr += " %s" % ("Status".ljust(15))
    outStr += " %s" % ("LastUpdate".ljust(20))
    outStr += " %s" % ("LFN".ljust(80))
    outStr += " %s" % ("SE".ljust(10))
    outStr += " %s" % ("Reason".ljust(10))
    if showJobs:
        outStr += " %s" % ("Jobs".ljust(10))
    outStr += " %s" % ("PinExpiryTime".ljust(15))
    outStr += " %s" % ("PinLength(sec)".ljust(15))
    outStr += "\n"

    # One row per cached replica
    for crid, info in replicas.items():
        outStr += " %s" % (info["Status"].ljust(15))
        outStr += " %s" % (str(info["LastUpdate"]).ljust(20))
        outStr += " %s" % (info["LFN"].ljust(30))
        outStr += " %s" % (info["SE"].ljust(15))
        outStr += " %s" % (str(info["Reason"]).ljust(10))

        # Task info
        if showJobs:
            resTasks = client.getTasks({"ReplicaID": crid})
            if resTasks["OK"]:
                tasks = resTasks["Value"]
                if tasks:
                    jobs = [task["SourceTaskID"] for task in tasks.values()]
                    outStr += " %s " % (str(jobs).ljust(10))
                else:
                    outStr += " %s " % (" --- ".ljust(10))

        # Stage request info (there may be no request to the site yet)
        resStageRequests = client.getStageRequests({"ReplicaID": crid})
        if not resStageRequests["OK"]:
            # Log the failure and leave the pin columns empty for this row
            # instead of crashing on the missing "Records" key.
            gLogger.error(resStageRequests["Message"])
        elif resStageRequests["Records"]:
            for request in resStageRequests["Value"].values():
                outStr += " %s" % (str(request["PinExpiryTime"]).ljust(20))
                outStr += " %s" % (str(request["PinLength"]).ljust(10))
        outStr += "\n"

    gLogger.notice(outStr)
class StageMonitorAgent(AgentModule):
    """Agent handling the StageSubmitted -> Staged transition of cached replicas."""

    def initialize(self):
        """Create the clients and read the agent options; returns S_OK()."""
        self.stagerClient = StorageManagerClient()
        # The default proxy is the one defined under
        # /Operations/Shifter/DataManager; the shifterProxy option in the
        # Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")
        self.storagePlugins = self.am_getOption("StoragePlugins", [])
        self.dataOpSender = DataOperationSender()
        return S_OK()

    def execute(self):
        """Store the current proxy information, then run one monitoring pass."""
        proxyRes = getProxyInfo(disableVOMS=True)
        if proxyRes["OK"]:
            self.proxyInfoDict = proxyRes["Value"]
            return self.monitorStageRequests()
        return proxyRes

    def monitorStageRequests(self):
        """Third logical task: manage the StageSubmitted->Staged transition of the replicas.

        Returns the failed/empty lookup result unchanged, otherwise whatever
        ``dataOpSender.concludeSending()`` returns.
        """
        submitted = self.__getStageSubmittedReplicas()
        if not submitted["OK"]:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                submitted["Message"],
            )
            return submitted

        found = submitted["Value"]
        if not found:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return submitted

        replicasBySE = found["SEReplicas"]
        allReplicas = found["ReplicaIDs"]
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(allReplicas)
        )
        for seName, idsAtSE in replicasBySE.items():
            self.__monitorStorageElementStageRequests(seName, idsAtSE, allReplicas)
        return self.dataOpSender.concludeSending()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Check the files requested at one storage element and update their state.

        :param storageElement: name of the storage element to query
        :param seReplicaIDs: replica IDs requested at that storage element
        :param replicaIDs: dict replicaID -> replica info (the "LFN" key is read here)
        :return: None; problems are logged, not raised
        """
        # Within a single SE the LFN is a unique key
        lfnToReplicaID = {replicaIDs[rid]["LFN"]: rid for rid in seReplicaIDs}
        if not lfnToReplicaID:
            gLogger.warn(
                "StageMonitor.__monitorStorageElementStageRequests: No requests to monitor for %s." % storageElement
            )
            return
        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(lfnToReplicaID), storageElement)
        )

        startTime = datetime.datetime.utcnow()
        seObject = StorageElement(storageElement, plugins=self.storagePlugins)
        metaRes = seObject.getFileMetadata(lfnToReplicaID)
        if not metaRes["OK"]:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas",
                metaRes["Message"],
            )
            return
        prestageStatus = metaRes["Value"]

        accountingDict = self.__newAccountingDict(storageElement)

        # Files missing from the storage are terminal failures
        terminalReplicaIDs = {}
        for lfn, reason in prestageStatus["Failed"].items():
            accountingDict["TransferTotal"] += 1
            if re.search("File does not exist", reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement", lfn
                )
                terminalReplicaIDs[lfnToReplicaID[lfn]] = "LFN did not exist in the StorageElement"

        # Sort the remaining files into "staged" and "still waiting"
        stagedReplicas = []
        oldRequests = []
        for lfn, metadata in prestageStatus["Successful"].items():
            if metadata:
                staged = metadata.get("Cached", metadata["Accessible"])
                if staged:
                    accountingDict["TransferTotal"] += 1
                    accountingDict["TransferOK"] += 1
                    accountingDict["TransferSize"] += metadata["Size"]
                    stagedReplicas.append(lfnToReplicaID[lfn])
                elif staged is not None:
                    # only the ReplicaIDs are kept
                    oldRequests.append(lfnToReplicaID[lfn])

        # Hand the accounting record over to the data operation sender
        self.dataOpSender.sendData(accountingDict, startTime=startTime, endTime=datetime.datetime.utcnow())

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs)
            )
            updRes = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not updRes["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    updRes["Message"],
                )

        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas)
            )
            updRes = self.stagerClient.setStageComplete(stagedReplicas)
            if not updRes["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    updRes["Message"],
                )
            updRes = self.stagerClient.updateReplicaStatus(stagedReplicas, "Staged")
            if not updRes["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    updRes["Message"],
                )

        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests)
            )
            updRes = self.__wakeupOldRequests(oldRequests)
            if not updRes["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    updRes["Message"],
                )
        return

    def __newAccountingDict(self, storageElement):
        """Return a fresh accounting dictionary for a "Stage" operation on storageElement."""
        return {
            "OperationType": "Stage",
            "User": self.proxyInfoDict["username"],
            "Protocol": "Stager",
            "RegistrationTime": 0.0,
            "RegistrationOK": 0,
            "RegistrationTotal": 0,
            "FinalStatus": "Successful",
            "Source": storageElement,
            "Destination": storageElement,
            "ExecutionSite": siteName(),
            "TransferTotal": 0,
            "TransferOK": 0,
            "TransferSize": 0,
            "TransferTime": self.am_getPollingTime(),
        }

    def __getStageSubmittedReplicas(self):
        """Obtain the StageSubmitted replicas and attach their RequestID.

        The replicas come from ``getCacheReplicas`` and the RequestID from
        ``getStageRequests``.

        :return: S_OK() with no value when nothing is found, otherwise
                 S_OK({"SEReplicas": {SE: [replicaID, ...]}, "ReplicaIDs": {replicaID: info}})
                 or the failed result / S_ERROR
        """
        cacheRes = self.stagerClient.getCacheReplicas({"Status": "StageSubmitted"})
        if not cacheRes["OK"]:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                cacheRes["Message"],
            )
            return cacheRes
        if not cacheRes["Value"]:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug(
            "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
            % len(cacheRes["Value"])
        )

        # Group the replica IDs by storage element
        replicaIDs = cacheRes["Value"]
        seReplicas = {}
        for replicaID, replicaInfo in replicaIDs.items():
            seName = replicaInfo["SE"]
            if seName not in seReplicas:
                seReplicas[seName] = []
            seReplicas[seName].append(replicaID)

        # The RequestID is not part of the replica info, fetch it separately
        requestRes = self.stagerClient.getStageRequests({"ReplicaID": list(replicaIDs)})
        if not requestRes["OK"]:
            return requestRes
        if not requestRes["Value"]:
            return S_ERROR(
                "Could not obtain request IDs for replicas %s from StageRequests table" % list(replicaIDs)
            )

        for replicaID, requestInfo in requestRes["Value"].items():
            replicaIDs[replicaID]["RequestID"] = requestInfo["RequestID"]

        return S_OK({"SEReplicas": seReplicas, "ReplicaIDs": replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        """Ask the stager service to wake up the given old requests.

        The retry interval is read from the RetryIntervalHour option (default 2).
        """
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryHours = self.am_getOption("RetryIntervalHour", 2)
        wakeRes = self.stagerClient.wakeupOldRequests(oldRequests, retryHours)
        if wakeRes["OK"]:
            return S_OK()
        gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", wakeRes["Message"])
        return wakeRes
class RequestPreparationAgent(AgentModule):
    """Agent handling the New -> Waiting transition of cached replicas.

    New replicas are checked against the file catalogue (existence, size,
    registered replicas); replicas that fail a check are flagged as failed,
    the others get their metadata updated through the stager service.
    """

    def initialize(self):
        """Create the clients used by the agent; returns S_OK()."""
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # The default proxy is the one defined under
        # /Operations/Shifter/DataManager; the shifterProxy option in the
        # Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """Run one preparation pass and return its result."""
        return self.prepareNewReplicas()

    @staticmethod
    def _flagTerminalFiles(replicas, terminal, terminalReplicaIDs):
        """Move every replica of the ``terminal`` LFNs out of ``replicas``.

        :param replicas: dict LFN -> {SE: replicaID}, modified in place
        :param terminal: dict LFN -> failure reason
        :param terminalReplicaIDs: dict replicaID -> reason, filled in place
        """
        for lfn, reason in terminal.items():
            for replicaID in replicas.pop(lfn, {}).values():
                terminalReplicaIDs[replicaID] = reason

    def prepareNewReplicas(self):
        """First logical task: manage the New->Waiting transition of the replicas.

        :return: the failed result of a lookup, the empty lookup result when
                 there is nothing to do, or S_OK()
        """
        res = self.__getNewReplicas()
        if not res['OK']:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'])
            return res
        if not res['Value']:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res['Value']['Replicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs))

        # Check that the files exist in the FileCatalog
        res = self.__getExistingFiles(list(replicas))
        if not res['OK']:
            return res
        exist = res['Value']['Exist']
        terminal = res['Value']['Missing']
        failed = res['Value']['Failed']
        if not exist:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed determine existance of any files')
            return S_OK()
        terminalReplicaIDs = {}
        self._flagTerminalFiles(replicas, terminal, terminalReplicaIDs)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal))

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroSize']
        fileSizes = res['Value']['FileSizes']
        if not fileSizes:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed determine sizes of any files')
            return S_OK()
        self._flagTerminalFiles(replicas, terminal, terminalReplicaIDs)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal))

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroReplicas']
        fileReplicas = res['Value']['Replicas']
        if not fileReplicas:
            gLogger.error('RequestPreparation.prepareNewReplicas: Failed determine replicas for any files')
            return S_OK()
        self._flagTerminalFiles(replicas, terminal, terminalReplicaIDs)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal))

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            # LFNs whose catalogue lookup failed are still in `replicas` but
            # have no size/replica information: skip them instead of raising
            # a KeyError that would abort the whole cycle.
            if lfn not in fileReplicas or lfn not in fileSizes:
                gLogger.warn(
                    "RequestPreparation.prepareNewReplicas: Missing size or replica information, skipping", lfn)
                continue
            lfnReplicas = fileReplicas[lfn]
            # Iterate over a snapshot because entries are removed inside the loop
            for requestedSE, replicaID in list(requestedSEs.items()):
                if requestedSE not in lfnReplicas:
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    requestedSEs.pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'])
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata))
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'])
        return S_OK()

    def __getNewReplicas(self):
        """Obtain the New replicas and, for each LFN, the requested storage elements.

        :return: S_OK() with no value when nothing is found, otherwise
                 S_OK({'Replicas': {LFN: {SE: replicaID}}, 'ReplicaIDs': {replicaID: (LFN, SE)}})
                 or the failed result
        """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({'Status': 'New'})
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        gLogger.debug(
            "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res['Value']))
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            # setdefault replaces dict.has_key, which no longer exists in Python 3
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs})

    def __getExistingFiles(self, lfns):
        """Check that the files exist in the FileCatalog.

        :return: S_OK({'Exist': [LFN, ...], 'Missing': {LFN: reason}, 'Failed': ...})
                 or the failed catalogue result
        """
        filesExist = []
        missing = {}
        res = self.fileCatalog.exists(lfns)
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, exists in res['Value']['Successful'].items():
            if exists:
                filesExist.append(lfn)
            else:
                missing[lfn] = 'LFN not registered in the FileCatalog'
        if missing:
            for lfn, reason in missing.items():
                gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, lfn)
            self.__reportProblematicFiles(list(missing), 'LFN-LFC-DoesntExist')
        return S_OK({'Exist': filesExist, 'Missing': missing, 'Failed': failed})

    def __getFileSize(self, lfns):
        """Obtain the file sizes from the FileCatalog.

        :return: S_OK({'FileSizes': {LFN: size}, 'ZeroSize': {LFN: reason}, 'Failed': ...})
                 or the failed catalogue result
        """
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res['OK']:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, size in res['Value']['Successful'].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            self.__reportProblematicFiles(list(zeroSize), 'LFN-LFC-ZeroSize')
        return S_OK({'FileSizes': fileSizes, 'ZeroSize': zeroSize, 'Failed': failed})

    def __getFileReplicas(self, lfns):
        """Obtain the active replicas of the files through the DataManager.

        :return: S_OK({'Replicas': {LFN: replicas}, 'ZeroReplicas': {LFN: reason}, 'Failed': ...})
                 or the failed result
        """
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res['OK']:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, lfnReplicas in res['Value']['Successful'].items():
            if not lfnReplicas:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), 'LFN-LFC-NoReplicas')
        return S_OK({'Replicas': replicas, 'ZeroReplicas': noReplicas, 'Failed': failed})

    def __reportProblematicFiles(self, lfns, reason):
        """Report problematic files to the data integrity service (currently switched off).

        NOTE(review): the unconditional return below short-circuits the
        reporting; the code after it is unreachable and kept only so the
        reporting can be re-enabled. Confirm whether this is still intended.
        """
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(
            lfns, reason, sourceComponent='RequestPreparationAgent')
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res['Value']['Failed']))
        return res
class StageMonitorAgent(AgentModule):
    """Agent handling the StageSubmitted -> Staged transition of cached replicas."""

    def initialize(self):
        """Create the clients used by the agent; returns S_OK()."""
        self.replicaManager = ReplicaManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # The default proxy is the one defined under
        # /Operations/Shifter/DataManager; the shifterProxy option in the
        # Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """Store the current proxy information, then run one monitoring pass."""
        proxyRes = getProxyInfo(disableVOMS=True)
        if proxyRes['OK']:
            self.proxyInfoDict = proxyRes['Value']
            return self.monitorStageRequests()
        return proxyRes

    def monitorStageRequests(self):
        """Third logical task: manage the StageSubmitted->Staged transition of the replicas."""
        submitted = self.__getStageSubmittedReplicas()
        if not submitted['OK']:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                submitted['Message'])
            return submitted

        found = submitted['Value']
        if not found:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return submitted

        replicasBySE = found['SEReplicas']
        allReplicas = found['ReplicaIDs']
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(allReplicas))
        for seName, idsAtSE in replicasBySE.items():
            self.__monitorStorageElementStageRequests(seName, idsAtSE, allReplicas)
        # Send the accounting records registered while monitoring
        gDataStoreClient.commit()
        return S_OK()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Check the files requested at one storage element and update their state.

        :param storageElement: name of the storage element to query
        :param seReplicaIDs: replica IDs requested at that storage element
        :param replicaIDs: dict replicaID -> replica info ('PFN' and optionally 'RequestID' are read)
        :return: None; problems are logged, not raised
        """
        pfnToReplicaID = {}
        pfnToRequestID = {}
        for replicaID in seReplicaIDs:
            replicaInfo = replicaIDs[replicaID]
            pfn = replicaInfo['PFN']
            pfnToReplicaID[pfn] = replicaID
            requestID = replicaInfo.get('RequestID')
            # Only PFNs that have a request ID are queried on the storage
            if requestID:
                pfnToRequestID[pfn] = requestID

        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(pfnToReplicaID), storageElement))
        oAccounting = DataOperation()
        oAccounting.setStartTime()

        metaRes = self.replicaManager.getStorageFileMetadata(pfnToRequestID.keys(), storageElement)
        if not metaRes['OK']:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.",
                metaRes['Message'])
            return
        prestageStatus = metaRes['Value']

        accountingDict = self.__newAccountingDict(storageElement)

        # Files missing from the storage are terminal failures
        terminalReplicaIDs = {}
        for pfn, reason in prestageStatus['Failed'].items():
            accountingDict['TransferTotal'] += 1
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn)
                terminalReplicaIDs[pfnToReplicaID[pfn]] = 'PFN did not exist in the StorageElement'

        # Sort the remaining files into "staged" and "still waiting"
        stagedReplicas = []
        oldRequests = []
        for pfn, staged in prestageStatus['Successful'].items():
            if not staged or 'Cached' not in staged:
                continue
            if staged['Cached']:
                accountingDict['TransferTotal'] += 1
                accountingDict['TransferOK'] += 1
                accountingDict['TransferSize'] += staged['Size']
                stagedReplicas.append(pfnToReplicaID[pfn])
            else:
                # only the ReplicaIDs are kept
                oldRequests.append(pfnToReplicaID[pfn])

        oAccounting.setValuesFromDict(accountingDict)
        oAccounting.setEndTime()
        gDataStoreClient.addRegister(oAccounting)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            updRes = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not updRes['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    updRes['Message'])

        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas))
            updRes = self.stagerClient.setStageComplete(stagedReplicas)
            if not updRes['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    updRes['Message'])
            updRes = self.stagerClient.updateReplicaStatus(stagedReplicas, 'Staged')
            if not updRes['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    updRes['Message'])

        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests))
            updRes = self.__wakeupOldRequests(oldRequests)
            if not updRes['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    updRes['Message'])
        return

    def __newAccountingDict(self, storageElement):
        """Return a fresh accounting dictionary for a 'Stage' operation on storageElement."""
        return {
            'OperationType': 'Stage',
            'User': self.proxyInfoDict['username'],
            'Protocol': 'Stager',
            'RegistrationTime': 0.0,
            'RegistrationOK': 0,
            'RegistrationTotal': 0,
            'FinalStatus': 'Successful',
            'Source': storageElement,
            'Destination': storageElement,
            'ExecutionSite': siteName(),
            'TransferTotal': 0,
            'TransferOK': 0,
            'TransferSize': 0,
            'TransferTime': self.am_getPollingTime(),
        }

    def __getStageSubmittedReplicas(self):
        """Obtain the StageSubmitted replicas and attach their RequestID.

        The replicas come from ``getCacheReplicas`` and the RequestID from
        ``getStageRequests``.

        :return: S_OK() with no value when nothing is found, otherwise
                 S_OK({'SEReplicas': {SE: [replicaID, ...]}, 'ReplicaIDs': {replicaID: info}})
                 or the failed result / S_ERROR
        """
        cacheRes = self.stagerClient.getCacheReplicas({'Status': 'StageSubmitted'})
        if not cacheRes['OK']:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                cacheRes['Message'])
            return cacheRes
        if not cacheRes['Value']:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug(
            "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
            % len(cacheRes['Value']))

        # Group the replica IDs by storage element
        replicaIDs = cacheRes['Value']
        seReplicas = {}
        for replicaID, replicaInfo in replicaIDs.items():
            seReplicas.setdefault(replicaInfo['SE'], []).append(replicaID)

        # The RequestID is not part of the replica info, fetch it separately
        requestRes = self.stagerClient.getStageRequests({'ReplicaID': replicaIDs.keys()})
        if not requestRes['OK']:
            return requestRes
        if not requestRes['Value']:
            return S_ERROR(
                'Could not obtain request IDs for replicas %s from StageRequests table' % (replicaIDs.keys()))

        for replicaID, requestInfo in requestRes['Value'].items():
            replicaIDs[replicaID]['RequestID'] = requestInfo['RequestID']

        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __reportProblematicFiles(self, lfns, reason):
        """Report problematic files to the data integrity service (currently switched off).

        NOTE(review): the unconditional return below short-circuits the
        reporting; the code after it is unreachable and kept only so the
        reporting can be re-enabled. Confirm whether this is still intended.
        """
        return S_OK()
        reportRes = self.dataIntegrityClient.setFileProblematic(
            lfns, reason, sourceComponent='StageMonitorAgent')
        if not reportRes['OK']:
            gLogger.error(
                "StageMonitor.__reportProblematicFiles: Failed to report missing files.", reportRes['Message'])
            return reportRes
        if reportRes['Value']['Successful']:
            gLogger.info(
                "StageMonitor.__reportProblematicFiles: Successfully reported %s missing files."
                % len(reportRes['Value']['Successful']))
        if reportRes['Value']['Failed']:
            gLogger.info(
                "StageMonitor.__reportProblematicFiles: Failed to report %s problematic files."
                % len(reportRes['Value']['Failed']))
        return reportRes

    def __wakeupOldRequests(self, oldRequests):
        """Ask the stager service to wake up the given old requests.

        The retry interval is read from the RetryIntervalHour option (default 2).
        """
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryHours = self.am_getOption('RetryIntervalHour', 2)
        wakeRes = self.stagerClient.wakeupOldRequests(oldRequests, retryHours)
        if wakeRes['OK']:
            return S_OK()
        gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", wakeRes['Message'])
        return wakeRes
def main():
    """Report the staging status of one file at one storage element.

    Takes two positional arguments, the LFN and the SE, looks up the matching
    cache replica and logs its details, the IDs of the jobs that requested it
    and the SRM staging request information.

    Exits with status 0 after logging the report (including when no such file
    is known) and with status 1 when the cache replica lookup fails.
    """
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument("LFN: LFN of the staging file")
    Script.registerArgument("SE: Storage Element for the staging file")
    Script.parseCommandLine(ignoreErrors=True)

    from DIRAC import exit as DIRACExit, gLogger

    lfn, se = Script.getPositionalArgs(group=True)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    res = client.getCacheReplicas({"LFN": lfn, "SE": se})
    if not res["OK"]:
        # Without a result there is nothing to report; stop here instead of
        # failing on the missing "Value" key below.
        gLogger.error(res["Message"])
        DIRACExit(1)

    cacheReplicaInfo = res["Value"]
    if cacheReplicaInfo:
        # Only the first matching replica is reported
        replicaID = list(cacheReplicaInfo)[0]
        replica = cacheReplicaInfo[replicaID]
        outStr = "\n--------------------"
        outStr += "\n%s: %s" % ("LFN".ljust(8), replica["LFN"].ljust(100))
        outStr += "\n%s: %s" % ("SE".ljust(8), replica["SE"].ljust(100))
        outStr += "\n%s: %s" % ("PFN".ljust(8), replica["PFN"].ljust(100))
        outStr += "\n%s: %s" % ("Status".ljust(8), replica["Status"].ljust(100))
        outStr += "\n%s: %s" % ("LastUpdate".ljust(8), str(replica["LastUpdate"]).ljust(100))
        outStr += "\n%s: %s" % ("Reason".ljust(8), str(replica["Reason"]).ljust(100))

        resTasks = client.getTasks({"ReplicaID": replicaID})
        if resTasks["OK"]:
            outStr += "\nJob IDs requesting this file to be staged:".ljust(8)
            for task in resTasks["Value"].values():
                outStr += " %s " % (task["SourceTaskID"])

        resStageRequests = client.getStageRequests({"ReplicaID": replicaID})
        if not resStageRequests["OK"]:
            # Log the failure and report what is known so far instead of
            # crashing on the missing "Records" key.
            gLogger.error(resStageRequests["Message"])
        elif resStageRequests["Records"]:
            outStr += "\n------SRM staging request info--------------"
            for info in resStageRequests["Value"].values():
                outStr += "\n%s: %s" % ("SRM RequestID".ljust(8), info["RequestID"].ljust(100))
                outStr += "\n%s: %s" % ("SRM StageStatus".ljust(8), info["StageStatus"].ljust(100))
                outStr += "\n%s: %s" % (
                    "SRM StageRequestSubmitTime".ljust(8),
                    str(info["StageRequestSubmitTime"]).ljust(100),
                )
                outStr += "\n%s: %s" % (
                    "SRM StageRequestCompletedTime".ljust(8),
                    str(info["StageRequestCompletedTime"]).ljust(100),
                )
                outStr += "\n%s: %s" % ("SRM PinExpiryTime".ljust(8), str(info["PinExpiryTime"]).ljust(100))
                outStr += "\n%s: %s sec" % ("SRM PinLength".ljust(8), str(info["PinLength"]).ljust(100))
        else:
            outStr += "\nThere are no staging requests submitted to the site yet.".ljust(8)
    else:
        outStr = "\nThere is no such file requested for staging. Check for typo's!"

    gLogger.notice(outStr)
    DIRACExit(0)