def main():
    """Report the staging status of one file at one Storage Element.

    Two positional arguments are read from the command line: the LFN and the
    SE name. The matching cache replica is looked up through the
    StorageManagerClient and its details, the task IDs that requested it and
    any stage requests recorded for it are written with ``gLogger.notice``.

    The script exits with status 0 after printing. If the replica lookup
    itself fails, the error is logged and the script exits with status 1.
    """
    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()
    if len(args) < 2:
        Script.showHelp()

    from DIRAC import exit as DIRACExit, gLogger

    lfn = args[0]
    se = args[1]

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    def row(label, value):
        """Format one 'label: value' line of the report."""
        return "\n%s: %s" % (label.ljust(8), value.ljust(100))

    client = StorageManagerClient()
    res = client.getCacheReplicas({'LFN': lfn, 'SE': se})
    if not res['OK']:
        # Do not go on to read res['Value'] from a failed call.
        gLogger.error(res['Message'])
        DIRACExit(1)

    cacheReplicaInfo = res['Value']
    if cacheReplicaInfo:
        # Only the first matching replica is reported.
        replicaID = list(cacheReplicaInfo)[0]
        replica = cacheReplicaInfo[replicaID]
        lines = [
            "\n--------------------",
            row('LFN', replica['LFN']),
            row('SE', replica['SE']),
            row('PFN', replica['PFN']),
            row('Status', replica['Status']),
            row('LastUpdate', str(replica['LastUpdate'])),
            row('Reason', str(replica['Reason'])),
        ]

        resTasks = client.getTasks({'ReplicaID': replicaID})
        if resTasks['OK']:
            lines.append('\nJob IDs requesting this file to be staged:')
            for task in resTasks['Value'].values():
                lines.append(' %s ' % (task['SourceTaskID']))

        resStageRequests = client.getStageRequests({'ReplicaID': replicaID})
        if not resStageRequests['OK']:
            # Log and leave the stage request section out of the report.
            gLogger.error(resStageRequests['Message'])
        elif resStageRequests['Records']:
            lines.append("\n------SRM staging request info--------------")
            for info in resStageRequests['Value'].values():
                lines.append(row('SRM RequestID', info['RequestID']))
                lines.append(row('SRM StageStatus', info['StageStatus']))
                lines.append(row('SRM StageRequestSubmitTime', str(info['StageRequestSubmitTime'])))
                lines.append(row('SRM StageRequestCompletedTime', str(info['StageRequestCompletedTime'])))
                lines.append(row('SRM PinExpiryTime', str(info['PinExpiryTime'])))
                lines.append("\n%s: %s sec" % ('SRM PinLength'.ljust(8), str(info['PinLength']).ljust(100)))
        else:
            lines.append('\nThere are no staging requests submitted to the site yet.')
        outStr = "".join(lines)
    else:
        outStr = "\nThere is no such file requested for staging. Check for typo's!"
        # Script.showHelp()

    gLogger.notice(outStr)
    DIRACExit(0)
class StageMonitorAgent(AgentModule):
    """Agent that follows up replicas whose status is 'StageSubmitted'.

    On every cycle the replicas are grouped by Storage Element, the file
    metadata is queried per Storage Element and, depending on the answer, the
    replicas are marked as staged, marked as failed, or their requests are
    handed to the stager service to be woken up.
    """

    def initialize(self):
        """Create the stager client and select the shifter proxy.

        :return: S_OK()
        """
        self.stagerClient = StorageManagerClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """Run one agent cycle.

        The proxy information is stored in ``self.proxyInfoDict`` because the
        accounting records read the user name from it.

        :return: the failed getProxyInfo result, otherwise the result of
                 monitorStageRequests()
        """
        res = getProxyInfo(disableVOMS=True)
        if not res['OK']:
            return res
        self.proxyInfoDict = res['Value']
        return self.monitorStageRequests()

    def monitorStageRequests(self):
        """Manage the StageSubmitted->Staged transition of the Replicas.

        :return: the error result if the replicas could not be obtained, the
                 empty S_OK() if there is nothing to monitor, otherwise S_OK()
                 after the accounting records have been committed
        """
        res = self.__getStageSubmittedReplicas()
        if not res['OK']:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return res
        seReplicas = res['Value']['SEReplicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(replicaIDs))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__monitorStorageElementStageRequests(storageElement, seReplicaIDs, replicaIDs)

        gDataStoreClient.commit()

        return S_OK()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Check the stage requests of one Storage Element and update the replicas.

        :param storageElement: name of the Storage Element to query
        :param seReplicaIDs: replica IDs located at that Storage Element
        :param replicaIDs: dict replicaID -> replica info; 'LFN' is read from
                           each entry and 'RequestID' when present
        :return: None; failures are logged
        """
        terminalReplicaIDs = {}
        oldRequests = []
        stagedReplicas = []

        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {}
        lfnReqIDs = {}
        for replicaID in seReplicaIDs:
            replicaInfo = replicaIDs[replicaID]
            lfn = replicaInfo['LFN']
            lfnRepIDs[lfn] = replicaID
            requestID = replicaInfo.get('RequestID', None)
            if requestID:
                # Only replicas with a known request ID are queried below.
                lfnReqIDs[lfn] = requestID

        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(lfnRepIDs), storageElement))
        oAccounting = DataOperation()
        oAccounting.setStartTime()

        res = StorageElement(storageElement).getFileMetadata(lfnReqIDs)
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.",
                res['Message'])
            return
        prestageStatus = res['Value']

        accountingDict = self.__newAccountingDict(storageElement)

        for lfn, reason in prestageStatus['Failed'].items():
            accountingDict['TransferTotal'] += 1
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement",
                    lfn)
                terminalReplicaIDs[lfnRepIDs[lfn]] = 'LFN did not exist in the StorageElement'

        for lfn, staged in prestageStatus['Successful'].items():
            # Entries without a 'Cached' flag are neither staged nor retried.
            if not staged or 'Cached' not in staged:
                continue
            if staged['Cached']:
                accountingDict['TransferTotal'] += 1
                accountingDict['TransferOK'] += 1
                accountingDict['TransferSize'] += staged['Size']
                stagedReplicas.append(lfnRepIDs[lfn])
            else:
                oldRequests.append(lfnRepIDs[lfn])  # only ReplicaIDs

        oAccounting.setValuesFromDict(accountingDict)
        oAccounting.setEndTime()
        gDataStoreClient.addRegister(oAccounting)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    res['Message'])
        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas))
            res = self.stagerClient.setStageComplete(stagedReplicas)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    res['Message'])
            res = self.stagerClient.updateReplicaStatus(stagedReplicas, 'Staged')
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    res['Message'])
        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests))
            res = self.__wakeupOldRequests(oldRequests)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    res['Message'])
        return

    def __newAccountingDict(self, storageElement):
        """Generate a new accounting Dict

        :param storageElement: used as both 'Source' and 'Destination'
        :return: dict with the counters set to zero
        """
        accountingDict = {}
        accountingDict['OperationType'] = 'Stage'
        accountingDict['User'] = self.proxyInfoDict['username']
        accountingDict['Protocol'] = 'Stager'
        accountingDict['RegistrationTime'] = 0.0
        accountingDict['RegistrationOK'] = 0
        accountingDict['RegistrationTotal'] = 0
        accountingDict['FinalStatus'] = 'Successful'
        accountingDict['Source'] = storageElement
        accountingDict['Destination'] = storageElement
        accountingDict['ExecutionSite'] = siteName()
        accountingDict['TransferTotal'] = 0
        accountingDict['TransferOK'] = 0
        accountingDict['TransferSize'] = 0
        accountingDict['TransferTime'] = self.am_getPollingTime()
        return accountingDict

    def __getStageSubmittedReplicas(self):
        """Obtain the StageSubmitted replicas and attach their RequestID.

        :return: S_OK({'SEReplicas': {SE: [replicaID, ...]},
                       'ReplicaIDs': {replicaID: info}}), S_OK() without a
                 value when no replica is found, or an error result
        """
        res = self.stagerClient.getCacheReplicas({'Status': 'StageSubmitted'})
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug(
            "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
            % len(res['Value']))

        seReplicas = {}
        replicaIDs = res['Value']
        for replicaID, info in replicaIDs.items():
            # setdefault instead of dict.has_key, which Python 3 no longer provides
            seReplicas.setdefault(info['SE'], []).append(replicaID)

        # RequestID was missing from replicaIDs dictionary BUGGY?
        # A real list is passed rather than a dict view.
        res = self.stagerClient.getStageRequests({'ReplicaID': list(replicaIDs)})
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR(
                'Could not obtain request IDs for replicas %s from StageRequests table' % list(replicaIDs))

        for replicaID, info in res['Value'].items():
            replicaIDs[replicaID]['RequestID'] = info['RequestID']

        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        """Ask the stager service to wake up the given requests.

        :param oldRequests: list of replica IDs that are not cached yet
        :return: the error result of the service call, or S_OK()
        """
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryInterval = self.am_getOption('RetryIntervalHour', 2)
        res = self.stagerClient.wakeupOldRequests(oldRequests, retryInterval)
        if not res['OK']:
            gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", res['Message'])
            return res
        return S_OK()
cacheReplicaInfo[replicaID]['Status'].ljust(100)) outStr += "\n%s: %s" % ('LastUpdate'.ljust( 8), str(cacheReplicaInfo[replicaID]['LastUpdate']).ljust(100)) outStr += "\n%s: %s" % ('Reason'.ljust( 8), str(cacheReplicaInfo[replicaID]['Reason']).ljust(100)) resTasks = client.getTasks({'ReplicaID': replicaID}) if resTasks['OK']: # print resTasks['Message'] outStr += '\nJob IDs requesting this file to be staged:'.ljust(8) tasks = resTasks['Value'] for tid in tasks.keys(): outStr += ' %s ' % (tasks[tid]['SourceTaskID']) resStageRequests = client.getStageRequests({'ReplicaID': replicaID}) if not resStageRequests['OK']: gLogger.error(resStageRequests['Message']) if resStageRequests['Records']: stageRequests = resStageRequests['Value'] outStr += "\n------SRM staging request info--------------" for info in stageRequests.itervalues(): outStr += "\n%s: %s" % ('SRM RequestID'.ljust(8), info['RequestID'].ljust(100)) outStr += "\n%s: %s" % ('SRM StageStatus'.ljust(8), info['StageStatus'].ljust(100)) outStr += "\n%s: %s" % ('SRM StageRequestSubmitTime'.ljust( 8), str(info['StageRequestSubmitTime']).ljust(100)) outStr += "\n%s: %s" % ('SRM StageRequestCompletedTime'.ljust(
def run():
    """Print a table of the cache replicas selected by the command line switches.

    Reads the module-level ``switchDict``: 'status' and 'se' restrict the
    query, 'limit' caps the number of entries and 'showJobs' adds a column
    with the IDs of the tasks that requested each replica. The table is
    written with ``gLogger.notice``; a failed replica query is logged and
    nothing else is printed.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    queryDict = {}

    if 'status' in switchDict:
        queryDict['Status'] = str(switchDict['status'])

    if 'se' in switchDict:
        queryDict['SE'] = str(switchDict['se'])

    # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
    # must FIX that in StorageManagementDB.py!
    # ugly fix:
    newer = '1903-08-02 06:24:38'  # select newer than
    if 'limit' in switchDict:
        gLogger.notice("Query limited to %s entries" % switchDict['limit'])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict['limit']))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res['OK']:
        # Do not go on to index a failed result.
        gLogger.error(res['Message'])
        return

    if not res['Records']:
        gLogger.notice("No entries")
        return

    showJobs = 'showJobs' in switchDict

    header = [("Status", 15), ("LastUpdate", 20), ("LFN", 80), ("SE", 10), ("Reason", 10)]
    if showJobs:
        header.append(("Jobs", 10))
    header.extend([("PinExpiryTime", 15), ("PinLength(sec)", 15)])

    chunks = ["\n"]
    chunks.extend(" %s" % title.ljust(width) for title, width in header)
    chunks.append("\n")

    for crid, info in res['Value'].items():
        chunks.append(" %s" % (info['Status'].ljust(15)))
        chunks.append(" %s" % (str(info['LastUpdate']).ljust(20)))
        chunks.append(" %s" % (info['LFN'].ljust(30)))
        chunks.append(" %s" % (info['SE'].ljust(15)))
        chunks.append(" %s" % (str(info['Reason']).ljust(10)))

        # Task info
        if showJobs:
            resTasks = client.getTasks({'ReplicaID': crid})
            if resTasks['OK'] and resTasks['Value']:
                jobs = [task['SourceTaskID'] for task in resTasks['Value'].values()]
                chunks.append(' %s ' % (str(jobs).ljust(10)))
            else:
                # Always fill the Jobs column so the following columns stay aligned.
                chunks.append(' %s ' % (" --- ".ljust(10)))

        # Stage request info
        # what if there's no request to the site yet?
        resStageRequests = client.getStageRequests({'ReplicaID': crid})
        if not resStageRequests['OK']:
            gLogger.error(resStageRequests['Message'])
        elif resStageRequests['Records']:
            for request in resStageRequests['Value'].values():
                chunks.append(" %s" % (str(request['PinExpiryTime']).ljust(20)))
                chunks.append(" %s" % (str(request['PinLength']).ljust(10)))
        chunks.append("\n")

    gLogger.notice("".join(chunks))
outStr = "%s\n%s: %s" % ( outStr, 'SE'.ljust( 8 ), cacheReplicaInfo[replicaID]['SE'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'PFN'.ljust( 8 ), cacheReplicaInfo[replicaID]['PFN'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'Status'.ljust( 8 ), cacheReplicaInfo[replicaID]['Status'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'LastUpdate'.ljust( 8 ), str(cacheReplicaInfo[replicaID]['LastUpdate']).ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'Reason'.ljust( 8 ), str( cacheReplicaInfo[replicaID]['Reason']).ljust( 100 ) ) resTasks = client.getTasks({'ReplicaID':replicaID}) if resTasks['OK']: #print resTasks['Message'] outStr = '%s\nJob IDs requesting this file to be staged:'.ljust( 8) % outStr tasks = resTasks['Value'] for tid in tasks.keys(): outStr = '%s %s ' % (outStr, tasks[tid]['SourceTaskID']) resStageRequests = client.getStageRequests({'ReplicaID':replicaID}) if not resStageRequests['OK']: print resStageRequests['Message'] if resStageRequests['Records']: stageRequests = resStageRequests['Value'] outStr = "%s\n------SRM staging request info--------------" % outStr for srid in stageRequests.keys(): outStr = "%s\n%s: %s" % ( outStr, 'SRM RequestID'.ljust( 8 ), stageRequests[srid]['RequestID'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'SRM StageStatus'.ljust( 8 ), stageRequests[srid]['StageStatus'].ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'SRM StageRequestSubmitTime'.ljust( 8 ), str(stageRequests[srid]['StageRequestSubmitTime']).ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'SRM StageRequestCompletedTime'.ljust( 8 ), str(stageRequests[srid]['StageRequestCompletedTime']).ljust( 100 ) ) outStr = "%s\n%s: %s" % ( outStr, 'SRM PinExpiryTime'.ljust( 8 ), str(stageRequests[srid]['PinExpiryTime']).ljust( 100 ) ) outStr = "%s\n%s: %s sec" % ( outStr, 'SRM PinLength'.ljust( 8 ), str(stageRequests[srid]['PinLength']).ljust( 100 ) ) else:
outStr = "%s\n%s: %s" % (outStr, "SE".ljust(8), cacheReplicaInfo[replicaID]["SE"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "PFN".ljust(8), cacheReplicaInfo[replicaID]["PFN"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "Status".ljust(8), cacheReplicaInfo[replicaID]["Status"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "LastUpdate".ljust(8), str(cacheReplicaInfo[replicaID]["LastUpdate"]).ljust(100)) outStr = "%s\n%s: %s" % (outStr, "Reason".ljust(8), str(cacheReplicaInfo[replicaID]["Reason"]).ljust(100)) resTasks = client.getTasks({"ReplicaID": replicaID}) if resTasks["OK"]: # print resTasks['Message'] outStr = "%s\nJob IDs requesting this file to be staged:".ljust(8) % outStr tasks = resTasks["Value"] for tid in tasks.keys(): outStr = "%s %s " % (outStr, tasks[tid]["SourceTaskID"]) resStageRequests = client.getStageRequests({"ReplicaID": replicaID}) if not resStageRequests["OK"]: print resStageRequests["Message"] if resStageRequests["Records"]: stageRequests = resStageRequests["Value"] outStr = "%s\n------SRM staging request info--------------" % outStr for srid in stageRequests.keys(): outStr = "%s\n%s: %s" % (outStr, "SRM RequestID".ljust(8), stageRequests[srid]["RequestID"].ljust(100)) outStr = "%s\n%s: %s" % (outStr, "SRM StageStatus".ljust(8), stageRequests[srid]["StageStatus"].ljust(100)) outStr = "%s\n%s: %s" % ( outStr, "SRM StageRequestSubmitTime".ljust(8), str(stageRequests[srid]["StageRequestSubmitTime"]).ljust(100), )
def run():
    """Print a table of the cache replicas selected by the command line switches.

    Reads the module-level ``switchDict``: 'status' and 'se' restrict the
    query, 'limit' caps the number of entries and 'showJobs' adds a column
    with the IDs of the tasks that requested each replica. Everything is
    written to standard output; a failed replica query prints its message and
    nothing else.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    queryDict = {}

    if 'status' in switchDict:
        queryDict['Status'] = str(switchDict['status'])

    if 'se' in switchDict:
        queryDict['SE'] = str(switchDict['se'])

    # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
    # must FIX that in StorageManagementDB.py!
    # ugly fix:
    newer = '1903-08-02 06:24:38'  # select newer than
    # print(...) with a single argument behaves the same in Python 2 and 3
    if 'limit' in switchDict:
        print("Query limited to %s entries" % switchDict['limit'])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict['limit']))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res['OK']:
        # Do not go on to index a failed result.
        print(res['Message'])
        return

    if not res['Records']:
        print("No entries")
        return

    showJobs = 'showJobs' in switchDict

    header = [("Status", 15), ("LastUpdate", 20), ("LFN", 80), ("SE", 10), ("Reason", 10)]
    if showJobs:
        header.append(("Jobs", 10))
    header.extend([("PinExpiryTime", 15), ("PinLength(sec)", 15)])

    # Collect the pieces and join once instead of re-formatting the whole
    # buffer for every field.
    chunks = ["\n"]
    chunks.extend(" %s" % title.ljust(width) for title, width in header)
    chunks.append("\n")

    replicas = res['Value']
    for crid in replicas:
        replica = replicas[crid]
        chunks.append(" %s" % replica['Status'].ljust(15))
        chunks.append(" %s" % str(replica['LastUpdate']).ljust(20))
        chunks.append(" %s" % replica['LFN'].ljust(30))
        chunks.append(" %s" % replica['SE'].ljust(15))
        chunks.append(" %s" % str(replica['Reason']).ljust(10))

        # Task info
        if showJobs:
            resTasks = client.getTasks({'ReplicaID': crid})
            if resTasks['OK'] and resTasks['Value']:
                tasks = resTasks['Value']
                jobs = [tasks[tid]['SourceTaskID'] for tid in tasks]
                chunks.append(' %s ' % str(jobs).ljust(10))
            else:
                # Always fill the Jobs column so the following columns stay aligned.
                chunks.append(' %s ' % " --- ".ljust(10))

        # Stage request info
        # what if there's no request to the site yet?
        resStageRequests = client.getStageRequests({'ReplicaID': crid})
        if not resStageRequests['OK']:
            print(resStageRequests['Message'])
        elif resStageRequests['Records']:
            stageRequests = resStageRequests['Value']
            for srid in stageRequests:
                chunks.append(" %s" % str(stageRequests[srid]['PinExpiryTime']).ljust(20))
                chunks.append(" %s" % str(stageRequests[srid]['PinLength']).ljust(10))
        chunks.append("\n")

    print("".join(chunks))
def run():
    """Print a table of the cache replicas selected by the command line switches.

    Reads the module-level ``switchDict``: "status" and "se" restrict the
    query, "limit" caps the number of entries and "showJobs" adds a column
    with the IDs of the tasks that requested each replica. The table is
    written with ``gLogger.notice``; a failed replica query is logged and
    nothing else is printed.
    """
    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    client = StorageManagerClient()
    queryDict = {}

    if "status" in switchDict:
        queryDict["Status"] = str(switchDict["status"])

    if "se" in switchDict:
        queryDict["SE"] = str(switchDict["se"])

    # weird: if there are no switches (dictionary is empty), then the --limit is ignored!!
    # must FIX that in StorageManagementDB.py!
    # ugly fix:
    newer = "1903-08-02 06:24:38"  # select newer than
    if "limit" in switchDict:
        gLogger.notice("Query limited to %s entries" % switchDict["limit"])
        res = client.getCacheReplicas(queryDict, None, newer, None, None, int(switchDict["limit"]))
    else:
        res = client.getCacheReplicas(queryDict)

    if not res["OK"]:
        # Do not go on to index a failed result.
        gLogger.error(res["Message"])
        return

    if not res["Records"]:
        gLogger.notice("No entries")
        return

    showJobs = "showJobs" in switchDict

    columns = [("Status", 15), ("LastUpdate", 20), ("LFN", 80), ("SE", 10), ("Reason", 10)]
    if showJobs:
        columns.append(("Jobs", 10))
    columns += [("PinExpiryTime", 15), ("PinLength(sec)", 15)]

    pieces = ["\n"]
    pieces.extend(" %s" % name.ljust(width) for name, width in columns)
    pieces.append("\n")

    for crid, info in res["Value"].items():
        pieces.append(" %s" % (info["Status"].ljust(15)))
        pieces.append(" %s" % (str(info["LastUpdate"]).ljust(20)))
        pieces.append(" %s" % (info["LFN"].ljust(30)))
        pieces.append(" %s" % (info["SE"].ljust(15)))
        pieces.append(" %s" % (str(info["Reason"]).ljust(10)))

        # Task info
        if showJobs:
            resTasks = client.getTasks({"ReplicaID": crid})
            if resTasks["OK"] and resTasks["Value"]:
                jobs = [task["SourceTaskID"] for task in resTasks["Value"].values()]
                pieces.append(" %s " % (str(jobs).ljust(10)))
            else:
                # Always fill the Jobs column so the following columns stay aligned.
                pieces.append(" %s " % (" --- ".ljust(10)))

        # Stage request info
        # what if there's no request to the site yet?
        resStageRequests = client.getStageRequests({"ReplicaID": crid})
        if not resStageRequests["OK"]:
            gLogger.error(resStageRequests["Message"])
        elif resStageRequests["Records"]:
            # A separate loop variable keeps the replica's own info intact.
            for stageRequest in resStageRequests["Value"].values():
                pieces.append(" %s" % (str(stageRequest["PinExpiryTime"]).ljust(20)))
                pieces.append(" %s" % (str(stageRequest["PinLength"]).ljust(10)))
        pieces.append("\n")

    gLogger.notice("".join(pieces))
class StageMonitorAgent(AgentModule):
    """Agent that watches replicas in the StageSubmitted state, one Storage Element at a time."""

    def initialize(self):
        """Set up the stager client, the shifter proxy, the storage plugins and the data operation sender."""
        self.stagerClient = StorageManagerClient()
        # The default proxy is the one defined under /Operations/Shifter/DataManager;
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")
        self.storagePlugins = self.am_getOption("StoragePlugins", [])
        self.dataOpSender = DataOperationSender()
        return S_OK()

    def execute(self):
        """Store the proxy information and run the monitoring step."""
        proxyResult = getProxyInfo(disableVOMS=True)
        if proxyResult["OK"]:
            self.proxyInfoDict = proxyResult["Value"]
            return self.monitorStageRequests()
        return proxyResult

    def monitorStageRequests(self):
        """Drive the StageSubmitted->Staged transition of the Replicas.

        Returns the lookup result when it failed or was empty, otherwise the
        result of concluding the data operation sending.
        """
        lookup = self.__getStageSubmittedReplicas()
        if not lookup["OK"]:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                lookup["Message"],
            )
            return lookup

        found = lookup["Value"]
        if not found:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return lookup

        perSE = found["SEReplicas"]
        allReplicas = found["ReplicaIDs"]
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(allReplicas)
        )
        for seName, idsAtSE in perSE.items():
            self.__monitorStorageElementStageRequests(seName, idsAtSE, allReplicas)

        return self.dataOpSender.concludeSending()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Query the file metadata at one Storage Element and update the replica states accordingly."""
        # Since we are in a given SE, the LFN is a unique key
        lfnRepIDs = {replicaIDs[repID]["LFN"]: repID for repID in seReplicaIDs}

        if not lfnRepIDs:
            gLogger.warn(
                "StageMonitor.__monitorStorageElementStageRequests: No requests to monitor for %s." % storageElement
            )
            return
        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(lfnRepIDs), storageElement)
        )

        startTime = datetime.datetime.utcnow()
        se = StorageElement(storageElement, plugins=self.storagePlugins)
        metaResult = se.getFileMetadata(lfnRepIDs)
        if not metaResult["OK"]:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas",
                metaResult["Message"],
            )
            return
        prestageStatus = metaResult["Value"]

        accountingDict = self.__newAccountingDict(storageElement)

        # Replicas whose file is reported as missing are terminally failed.
        terminalReplicaIDs = {}
        for lfn, reason in prestageStatus["Failed"].items():
            accountingDict["TransferTotal"] += 1
            if not re.search("File does not exist", reason):
                continue
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: LFN did not exist in the StorageElement", lfn
            )
            terminalReplicaIDs[lfnRepIDs[lfn]] = "LFN did not exist in the StorageElement"

        stagedReplicas = []
        oldRequests = []
        for lfn, metadata in prestageStatus["Successful"].items():
            if metadata:
                staged = metadata.get("Cached", metadata["Accessible"])
                if staged:
                    accountingDict["TransferTotal"] += 1
                    accountingDict["TransferOK"] += 1
                    accountingDict["TransferSize"] += metadata["Size"]
                    stagedReplicas.append(lfnRepIDs[lfn])
                elif staged is not None:
                    oldRequests.append(lfnRepIDs[lfn])  # only ReplicaIDs

        # Check if sending data operation to Monitoring
        endTime = datetime.datetime.utcnow()
        self.dataOpSender.sendData(accountingDict, startTime=startTime, endTime=endTime)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs)
            )
            failureUpdate = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not failureUpdate["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    failureUpdate["Message"],
                )

        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas)
            )
            completeUpdate = self.stagerClient.setStageComplete(stagedReplicas)
            if not completeUpdate["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    completeUpdate["Message"],
                )
            statusUpdate = self.stagerClient.updateReplicaStatus(stagedReplicas, "Staged")
            if not statusUpdate["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    statusUpdate["Message"],
                )

        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests)
            )
            wakeup = self.__wakeupOldRequests(oldRequests)
            if not wakeup["OK"]:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    wakeup["Message"],
                )

    def __newAccountingDict(self, storageElement):
        """Build a fresh accounting record with zeroed counters for the given Storage Element."""
        return {
            "OperationType": "Stage",
            "User": self.proxyInfoDict["username"],
            "Protocol": "Stager",
            "RegistrationTime": 0.0,
            "RegistrationOK": 0,
            "RegistrationTotal": 0,
            "FinalStatus": "Successful",
            "Source": storageElement,
            "Destination": storageElement,
            "ExecutionSite": siteName(),
            "TransferTotal": 0,
            "TransferOK": 0,
            "TransferSize": 0,
            "TransferTime": self.am_getPollingTime(),
        }

    def __getStageSubmittedReplicas(self):
        """Fetch the StageSubmitted replicas, group them by SE and add the RequestID of each one."""
        cacheResult = self.stagerClient.getCacheReplicas({"Status": "StageSubmitted"})
        if not cacheResult["OK"]:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                cacheResult["Message"],
            )
            return cacheResult

        replicaIDs = cacheResult["Value"]
        if not replicaIDs:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug(
            "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
            % len(replicaIDs)
        )

        seReplicas = {}
        for repID, repInfo in replicaIDs.items():
            seName = repInfo["SE"]
            if seName not in seReplicas:
                seReplicas[seName] = []
            seReplicas[seName].append(repID)

        # RequestID was missing from replicaIDs dictionary BUGGY?
        requestResult = self.stagerClient.getStageRequests({"ReplicaID": list(replicaIDs)})
        if not requestResult["OK"]:
            return requestResult
        requests = requestResult["Value"]
        if not requests:
            return S_ERROR(
                "Could not obtain request IDs for replicas %s from StageRequests table" % list(replicaIDs)
            )

        for repID, requestInfo in requests.items():
            replicaIDs[repID]["RequestID"] = requestInfo["RequestID"]

        return S_OK({"SEReplicas": seReplicas, "ReplicaIDs": replicaIDs})

    def __wakeupOldRequests(self, oldRequests):
        """Hand the given replica IDs to the stager service so their requests are retried."""
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        intervalHours = self.am_getOption("RetryIntervalHour", 2)
        outcome = self.stagerClient.wakeupOldRequests(oldRequests, intervalHours)
        if outcome["OK"]:
            return S_OK()
        gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", outcome["Message"])
        return outcome
class StageMonitorAgent(AgentModule):
    """Agent that follows up replicas whose status is 'StageSubmitted'.

    On every cycle the replicas are grouped by Storage Element, the storage
    file metadata is queried per Storage Element by PFN and, depending on the
    answer, the replicas are marked as staged, marked as failed, or their
    requests are handed to the stager service to be woken up.
    """

    def initialize(self):
        """Create the clients used by the agent and select the shifter proxy.

        :return: S_OK()
        """
        self.replicaManager = ReplicaManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # self.storageDB = StorageManagementDB()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """Run one agent cycle.

        The proxy information is stored in ``self.proxyInfoDict`` because the
        accounting records read the user name from it.

        :return: the failed getProxyInfo result, otherwise the result of
                 monitorStageRequests()
        """
        res = getProxyInfo(disableVOMS=True)
        if not res['OK']:
            return res
        self.proxyInfoDict = res['Value']
        return self.monitorStageRequests()

    def monitorStageRequests(self):
        """Manage the StageSubmitted->Staged transition of the Replicas.

        :return: the error result if the replicas could not be obtained, the
                 empty S_OK() if there is nothing to monitor, otherwise S_OK()
                 after the accounting records have been committed
        """
        res = self.__getStageSubmittedReplicas()
        if not res['OK']:
            gLogger.fatal(
                "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.info("StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found")
            return res
        seReplicas = res['Value']['SEReplicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring."
            % len(replicaIDs))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__monitorStorageElementStageRequests(storageElement, seReplicaIDs, replicaIDs)

        gDataStoreClient.commit()

        return S_OK()

    def __monitorStorageElementStageRequests(self, storageElement, seReplicaIDs, replicaIDs):
        """Check the stage requests of one Storage Element and update the replicas.

        :param storageElement: name of the Storage Element to query
        :param seReplicaIDs: replica IDs located at that Storage Element
        :param replicaIDs: dict replicaID -> replica info; 'PFN' is read from
                           each entry and 'RequestID' when present
        :return: None; failures are logged
        """
        terminalReplicaIDs = {}
        oldRequests = []
        stagedReplicas = []
        pfnRepIDs = {}
        pfnReqIDs = {}
        for replicaID in seReplicaIDs:
            replicaInfo = replicaIDs[replicaID]
            pfn = replicaInfo['PFN']
            pfnRepIDs[pfn] = replicaID
            requestID = replicaInfo.get('RequestID', None)
            if requestID:
                # Only replicas with a known request ID are queried below.
                pfnReqIDs[pfn] = requestID

        gLogger.info(
            "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s."
            % (len(pfnRepIDs), storageElement))
        oAccounting = DataOperation()
        oAccounting.setStartTime()

        # A real list is passed rather than a dict view.
        res = self.replicaManager.getStorageFileMetadata(list(pfnReqIDs), storageElement)
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.",
                res['Message'])
            return
        prestageStatus = res['Value']

        accountingDict = self.__newAccountingDict(storageElement)

        for pfn, reason in prestageStatus['Failed'].items():
            accountingDict['TransferTotal'] += 1
            if re.search('File does not exist', reason):
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement",
                    pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'

        for pfn, staged in prestageStatus['Successful'].items():
            # Entries without a 'Cached' flag are neither staged nor retried.
            if not staged or 'Cached' not in staged:
                continue
            if staged['Cached']:
                accountingDict['TransferTotal'] += 1
                accountingDict['TransferOK'] += 1
                accountingDict['TransferSize'] += staged['Size']
                stagedReplicas.append(pfnRepIDs[pfn])
            else:
                oldRequests.append(pfnRepIDs[pfn])  # only ReplicaIDs

        oAccounting.setValuesFromDict(accountingDict)
        oAccounting.setEndTime()
        gDataStoreClient.addRegister(oAccounting)

        # Update the states of the replicas in the database
        if terminalReplicaIDs:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.",
                    res['Message'])
        if stagedReplicas:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated."
                % len(stagedReplicas))
            res = self.stagerClient.setStageComplete(stagedReplicas)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.",
                    res['Message'])
            res = self.stagerClient.updateReplicaStatus(stagedReplicas, 'Staged')
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.",
                    res['Message'])
        if oldRequests:
            gLogger.info(
                "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried."
                % len(oldRequests))
            res = self.__wakeupOldRequests(oldRequests)
            if not res['OK']:
                gLogger.error(
                    "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.",
                    res['Message'])
        return

    def __newAccountingDict(self, storageElement):
        """Generate a new accounting Dict

        :param storageElement: used as both 'Source' and 'Destination'
        :return: dict with the counters set to zero
        """
        accountingDict = {}
        accountingDict['OperationType'] = 'Stage'
        accountingDict['User'] = self.proxyInfoDict['username']
        accountingDict['Protocol'] = 'Stager'
        accountingDict['RegistrationTime'] = 0.0
        accountingDict['RegistrationOK'] = 0
        accountingDict['RegistrationTotal'] = 0
        accountingDict['FinalStatus'] = 'Successful'
        accountingDict['Source'] = storageElement
        accountingDict['Destination'] = storageElement
        accountingDict['ExecutionSite'] = siteName()
        accountingDict['TransferTotal'] = 0
        accountingDict['TransferOK'] = 0
        accountingDict['TransferSize'] = 0
        accountingDict['TransferTime'] = self.am_getPollingTime()
        return accountingDict

    def __getStageSubmittedReplicas(self):
        """Obtain the StageSubmitted replicas and attach their RequestID.

        :return: S_OK({'SEReplicas': {SE: [replicaID, ...]},
                       'ReplicaIDs': {replicaID: info}}), S_OK() without a
                 value when no replica is found, or an error result
        """
        res = self.stagerClient.getCacheReplicas({'Status': 'StageSubmitted'})
        if not res['OK']:
            gLogger.error(
                "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process.")
            return S_OK()
        gLogger.debug(
            "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process."
            % len(res['Value']))

        seReplicas = {}
        replicaIDs = res['Value']
        for replicaID, info in replicaIDs.items():
            # setdefault instead of dict.has_key, which Python 3 no longer provides
            seReplicas.setdefault(info['SE'], []).append(replicaID)

        # RequestID was missing from replicaIDs dictionary BUGGY?
        res = self.stagerClient.getStageRequests({'ReplicaID': list(replicaIDs)})
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR(
                'Could not obtain request IDs for replicas %s from StageRequests table' % list(replicaIDs))

        for replicaID, info in res['Value'].items():
            replicaIDs[replicaID]['RequestID'] = info['RequestID']

        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __reportProblematicFiles(self, lfns, reason):
        """Report files as problematic (currently disabled).

        :param lfns: files to report
        :param reason: reason passed to the data integrity client
        :return: S_OK(); the reporting code below is never reached
        """
        # NOTE(review): the unconditional return disables everything below it;
        # the body is kept so reporting can be re-enabled by removing this line.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent='StageMonitorAgent')
        if not res['OK']:
            gLogger.error("StageMonitor.__reportProblematicFiles: Failed to report missing files.", res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info(
                "StageMonitor.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info(
                "StageMonitor.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res['Value']['Failed']))
        return res

    def __wakeupOldRequests(self, oldRequests):
        """Ask the stager service to wake up the given requests.

        :param oldRequests: list of replica IDs that are not cached yet
        :return: the error result of the service call, or S_OK()
        """
        gLogger.info("StageMonitor.__wakeupOldRequests: Attempting...")
        retryInterval = self.am_getOption('RetryIntervalHour', 2)
        res = self.stagerClient.wakeupOldRequests(oldRequests, retryInterval)
        if not res['OK']:
            gLogger.error("StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", res['Message'])
            return res
        return S_OK()
def main():
    """Print the staging information known for one file at one storage element.

    Takes two positional arguments, the LFN and the storage element. The
    cache replica entry, the IDs of the tasks requesting the file and the
    stage request details are collected into one message that is sent to
    gLogger.notice.

    Exits with status 0 when the lookup completed (including the case where
    no matching replica exists) and with status 2 when the cache replica or
    the stage request query fails.
    """
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument("LFN: LFN of the staging file")
    Script.registerArgument("SE: Storage Element for the staging file")
    Script.parseCommandLine(ignoreErrors=True)

    from DIRAC import exit as DIRACExit, gLogger

    lfn, se = Script.getPositionalArgs(group=True)

    from DIRAC.StorageManagementSystem.Client.StorageManagerClient import StorageManagerClient

    def formatField(label, value):
        """Return one 'label: value' output line (label padded to 8, value to 100)."""
        # str() keeps non-string values (numbers, datetimes, None) from
        # breaking the padding.
        return "\n%s: %s" % (label.ljust(8), str(value).ljust(100))

    client = StorageManagerClient()
    res = client.getCacheReplicas({"LFN": lfn, "SE": se})
    if not res["OK"]:
        # A failed result carries no "Value"; stop here instead of failing
        # with a KeyError further down.
        gLogger.error(res["Message"])
        DIRACExit(2)

    exitCode = 0
    cacheReplicaInfo = res["Value"]
    if cacheReplicaInfo:
        # Only the first matching replica is reported.
        replicaID = list(cacheReplicaInfo)[0]
        replica = cacheReplicaInfo[replicaID]

        parts = ["\n--------------------"]
        for key in ("LFN", "SE", "PFN", "Status", "LastUpdate", "Reason"):
            parts.append(formatField(key, replica[key]))

        resTasks = client.getTasks({"ReplicaID": replicaID})
        if resTasks["OK"]:
            parts.append("\nJob IDs requesting this file to be staged:")
            parts.extend(" %s " % task["SourceTaskID"] for task in resTasks["Value"].values())
        else:
            gLogger.error(resTasks["Message"])

        resStageRequests = client.getStageRequests({"ReplicaID": replicaID})
        if not resStageRequests["OK"]:
            # Still print what was gathered so far, but report the failure
            # through the exit status.
            gLogger.error(resStageRequests["Message"])
            exitCode = 2
        # NOTE(review): "Records" is read exactly as before; confirm that the
        # service really returns this key next to "Value".
        elif resStageRequests["Records"]:
            parts.append("\n------SRM staging request info--------------")
            for info in resStageRequests["Value"].values():
                parts.append(formatField("SRM RequestID", info["RequestID"]))
                parts.append(formatField("SRM StageStatus", info["StageStatus"]))
                parts.append(formatField("SRM StageRequestSubmitTime", info["StageRequestSubmitTime"]))
                parts.append(formatField("SRM StageRequestCompletedTime", info["StageRequestCompletedTime"]))
                parts.append(formatField("SRM PinExpiryTime", info["PinExpiryTime"]))
                parts.append(formatField("SRM PinLength", info["PinLength"]) + " sec")
        else:
            parts.append("\nThere are no staging requests submitted to the site yet.")

        outStr = "".join(parts)
    else:
        outStr = "\nThere is no such file requested for staging. Check for typo's!"

    gLogger.notice(outStr)
    DIRACExit(exitCode)