def main():
    """Print the logging info of a pilot selected by UUID or by job ID.

    Registers the ``-u/--uuid`` and ``-j/--jobid`` switches and parses the
    command line.  When a job ID is set, ``getPilots`` is queried for that job
    and the first key of the returned value replaces the module-level ``uuid``.
    The logging info of that pilot is then written with ``gLogger.notice``.
    Any failed call is reported with ``gLogger.error`` followed by
    ``DIRAC.exit(1)``; on success the script ends with ``DIRAC.exit(0)``.
    """
    global uuid
    global jobid

    def _valueOrExit(reply):
        """Return reply['Value']; on a failed reply log the message and call DIRAC.exit(1)."""
        if not reply['OK']:
            gLogger.error(reply['Message'])
            DIRAC.exit(1)
        return reply['Value']

    switches = (
        ('u:', 'uuid=', 'get PilotsLogging for given Pilot UUID', setUUID),
        ('j:', 'jobid=', 'get PilotsLogging for given Job ID', setJobID),
    )
    for switch in switches:
        Script.registerSwitch(*switch)
    Script.parseCommandLine()

    # The client is imported only once the command line has been parsed
    # (NOTE(review): presumably required by DIRAC initialisation - confirm).
    from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

    if jobid:
        pilotsOfJob = _valueOrExit(PilotManagerClient().getPilots(jobid))
        gLogger.debug(pilotsOfJob)
        uuid = list(pilotsOfJob)[0]

    loggingInfo = _valueOrExit(PilotManagerClient().getPilotLoggingInfo(uuid))
    gLogger.notice(loggingInfo)
    DIRAC.exit(0)
def main():
    """Read the options and delete the matching pilots.

    The selection is built from the ``--status``, ``--site``, ``--ce`` and
    ``--vo`` options held by ``Params``.  Without an explicit status, pilots in
    the Submitted, Scheduled, Waiting and Unknown states are selected.  The
    matching pilot references are printed; unless a dry run was requested each
    of them is then killed through ``DiracAdmin.killPilot``.

    The script exits with status 1 when no selection option is given or when
    the pilot selection fails, and with status 0 when nothing matches or when
    only a dry run was requested.
    """
    options = Params()
    options.registerCLISwitches()
    Script.parseCommandLine(ignoreErrors=True)

    # make sure *something* is set: without any criterion the default status
    # list below would select (and kill) pilots of every site, CE and VO.
    if not (options.site or options.ce or options.vo or options.status):
        print("You must chose at least one of the following options: --vo, --ce --site")
        sys.exit(1)

    # occasionally the same job might appear twice, but that shouldn't matter
    conditions = {}
    if options.status:
        # NOTE(review): only the first element of options.status is used - confirm intended
        conditions["Status"] = options.status[0]
    else:
        conditions["Status"] = ["Submitted", "Scheduled", "Waiting", "Unknown"]

    if options.site:
        conditions["GridSite"] = options.site

    if options.ce:
        conditions["DestinationSite"] = options.ce

    if options.vo:
        conditions["OwnerGroup"] = options.vo + "_pilot"

    # Example of a complete condition dictionary:
    # conditions = {"Status":"Submitted", "GridSite":"LCG.UKI-LT2-IC-HEP.uk",
    #               "OwnerGroup":["lz_pilot", "gridpp_pilot"],
    #               "DestinationSite":"ceprod00.grid.hep.ph.ic.ac.uk"}

    print("Selecting pilots fulfulling the following conditions: %s" % conditions)

    pilotmanager = PilotManagerClient()
    result = pilotmanager.selectPilots(conditions)
    if not result["OK"]:
        # A failed call is reported explicitly instead of reading result["Value"].
        print("Failed to select pilots: %s" % result["Message"])
        sys.exit(1)

    pilotRefs = result["Value"]
    if not pilotRefs:
        print("No pilots matching these criteria were found.")
        sys.exit(0)

    print("Found the following matching pilots:")
    for pilotRef in pilotRefs:
        print(pilotRef)

    if options.dryrun:
        print("Dry run only. No pilots will be deleted")
        sys.exit(0)

    # now get the pilot references and delete them
    from DIRAC.Interfaces.API.DiracAdmin import DiracAdmin

    diracAdmin = DiracAdmin()
    for pilotRef in pilotRefs:
        # A separate name keeps the list being iterated distinct from the per-pilot reply.
        killResult = diracAdmin.killPilot(pilotRef)
        if not killResult["OK"]:
            print("Error encountered when deleting pilot %s" % pilotRef)
            print(killResult)
def __init__(self, args=None, clients=None):
    """Initialise the command and select the clients it uses.

    A client stored in ``self.apis`` under 'Pilots' or
    'ResourceManagementClient' is reused; for a missing entry a new
    PilotManagerClient / ResourceManagementClient is created.
    """
    super(PilotCommand, self).__init__(args, clients)

    # The fallback client is only constructed when the entry is missing.
    self.pilots = self.apis['Pilots'] if 'Pilots' in self.apis else PilotManagerClient()
    self.rmClient = (
        self.apis['ResourceManagementClient']
        if 'ResourceManagementClient' in self.apis
        else ResourceManagementClient()
    )
def __init__(self, args=None, clients=None):
    """Initialise the command and select the clients it uses.

    A client stored in ``self.apis`` under "Pilots" or
    "ResourceManagementClient" is reused; for a missing entry a new
    PilotManagerClient / ResourceManagementClient is created.
    """
    super(PilotCommand, self).__init__(args, clients)

    # The fallback client is only constructed when the entry is missing.
    self.pilots = self.apis["Pilots"] if "Pilots" in self.apis else PilotManagerClient()
    self.rmClient = (
        self.apis["ResourceManagementClient"]
        if "ResourceManagementClient" in self.apis
        else ResourceManagementClient()
    )
def initialize(self):
    """Set the agent defaults and create the DB and client objects it works with.

    :return: S_OK
    """
    self.am_setOption("GridEnv", "")

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # Delays read from the agent options (attribute, option name, default).
    delayOptions = (
        ("clearPilotsDelay", "ClearPilotsDelay", 30),
        ("clearAbortedDelay", "ClearAbortedPilotsDelay", 7),
    )
    for attribute, optionName, default in delayOptions:
        setattr(self, attribute, self.am_getOption(optionName, default))

    self.pilots = PilotManagerClient()

    return S_OK()
def initialize(self):
    """Set the agent defaults and create the DB and client objects it works with.

    :return: S_OK
    """
    # Default option values, set in this order.
    for optionName, value in (('PollingTime', 120), ('GridEnv', ''), ('PilotStalledDays', 3)):
        self.am_setOption(optionName, value)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # Delays read from the agent options (attribute, option name, default).
    delayOptions = (
        ('clearPilotsDelay', 'ClearPilotsDelay', 30),
        ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7),
    )
    for attribute, optionName, default in delayOptions:
        setattr(self, attribute, self.am_getOption(optionName, default))

    self.pilots = PilotManagerClient()

    return S_OK()
def initializeHandler(cls, svcInfoDict):
    """WMS AdministratorService initialization.

    Loads and instantiates JobDB (with ``parentLogger=cls.log``) and, when the
    Operations flag ``/Services/JobMonitoring/useESForJobParametersFlag`` is
    set, ElasticJobParametersDB.  Finally creates the PilotManagerClient.

    :param svcInfoDict: not used in this method
    :return: S_OK, the failed loadObject result, or S_ERROR on RuntimeError
    """
    try:
        loaded = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
        if not loaded["OK"]:
            return loaded
        jobDBClass = loaded["Value"]
        cls.jobDB = jobDBClass(parentLogger=cls.log)
    except RuntimeError as excp:
        return S_ERROR(f"Can't connect to DB: {excp!r}")

    # Stays None unless the ES flag below is enabled.
    cls.elasticJobParametersDB = None
    if Operations().getValue("/Services/JobMonitoring/useESForJobParametersFlag", False):
        try:
            loaded = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.ElasticJobParametersDB", "ElasticJobParametersDB"
            )
            if not loaded["OK"]:
                return loaded
            elasticDBClass = loaded["Value"]
            cls.elasticJobParametersDB = elasticDBClass()
        except RuntimeError as excp:
            return S_ERROR(f"Can't connect to DB: {excp!r}")

    cls.pilotManager = PilotManagerClient()

    return S_OK()
def initialize(self):
    """Define the commands to be executed and instantiate the clients that will be used.

    ResourceStatusClient and ResourceManagementClient are loaded through
    ObjectLoader; when a load fails the error is logged and the failed result
    is returned.

    :return: S_OK, or the failed loadObject result
    """
    rsLoad = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceStatusClient',
                                       'ResourceStatusClient')
    if not rsLoad['OK']:
        self.log.error('Failed to load ResourceStatusClient class: %s' % rsLoad['Message'])
        return rsLoad
    rsClass = rsLoad['Value']

    rmLoad = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceManagementClient',
                                       'ResourceManagementClient')
    if not rmLoad['OK']:
        self.log.error('Failed to load ResourceManagementClient class: %s' % rmLoad['Message'])
        return rmLoad
    rmClass = rmLoad['Value']

    # Each enabled command is registered under its own name with empty arguments.
    for commandName in ('Downtime', 'GOCDBSync', 'FreeDiskSpace'):
        self.commands[commandName] = [{commandName: {}}]

    # Disabled commands, kept for reference.
    #
    # PilotsCommand
    # self.commands[ 'Pilots' ] = [
    #     { 'PilotsWMS' : { 'element' : 'Site', 'siteName' : None } },
    #     { 'PilotsWMS' : { 'element' : 'Resource', 'siteName' : None } }
    # ]
    #
    # FIXME: do not forget about hourly vs Always ...etc
    # AccountingCacheCommand
    # self.commands[ 'AccountingCache' ] = [
    #     {'SuccessfullJobsBySiteSplitted'    :{'hours' :24, 'plotType' :'Job' }},
    #     {'FailedJobsBySiteSplitted'         :{'hours' :24, 'plotType' :'Job' }},
    #     {'SuccessfullPilotsBySiteSplitted'  :{'hours' :24, 'plotType' :'Pilot' }},
    #     {'FailedPilotsBySiteSplitted'       :{'hours' :24, 'plotType' :'Pilot' }},
    #     {'SuccessfullPilotsByCESplitted'    :{'hours' :24, 'plotType' :'Pilot' }},
    #     {'FailedPilotsByCESplitted'         :{'hours' :24, 'plotType' :'Pilot' }},
    #     {'RunningJobsBySiteSplitted'        :{'hours' :24, 'plotType' :'Job' }},
    #     # {'RunningJobsBySiteSplitted'      :{'hours' :168, 'plotType' :'Job' }},
    #     # {'RunningJobsBySiteSplitted'      :{'hours' :720, 'plotType' :'Job' }},
    #     # {'RunningJobsBySiteSplitted'      :{'hours' :8760, 'plotType' :'Job' }},
    # ]
    #
    # VOBOXAvailability
    # self.commands[ 'VOBOXAvailability' ] = [
    #     { 'VOBOXAvailability' : {} }
    #

    # Reuse clients for the commands; they are created in the order listed.
    clientFactories = (
        ('GOCDBClient', GOCDBClient),
        ('ReportsClient', ReportsClient),
        ('ResourceStatusClient', rsClass),
        ('ResourceManagementClient', rmClass),
        ('WMSAdministrator', WMSAdministratorClient),
        ('Pilots', PilotManagerClient),
    )
    for clientName, factory in clientFactories:
        self.clients[clientName] = factory()

    # The class itself is stored, not an instance.
    self.cCaller = CommandCaller

    return S_OK()
def getPilotOutput(self, gridReference, directory=""):
    """Retrieve the pilot output (std.out and std.err) of a pilot and store it on disk.

    The output is fetched with ``PilotManagerClient().getPilotOutput`` and
    written to ``<directory>/pilot_<ref>/std.out`` and ``.../std.err``, where
    ``<ref>`` is the part of the pilot reference after its last "/" (or
    "reference" when that part is empty).  An already existing output
    directory is never overwritten: an error is returned instead.

    :param gridReference: pilot reference
    :type gridReference: string
    :param directory: existing directory to create the output directory in;
                      ``self.currentDir`` is used when empty
    :type directory: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, str):
        return self._errorReport("Expected string for pilot reference")

    if not directory:
        directory = self.currentDir

    if not os.path.exists(directory):
        return self._errorReport("Directory %s does not exist" % directory)

    result = PilotManagerClient().getPilotOutput(gridReference)
    if not result["OK"]:
        return result

    gridReferenceSmall = gridReference.split("/")[-1] or "reference"
    outputPath = "%s/pilot_%s" % (directory, gridReferenceSmall)

    if os.path.exists(outputPath):
        self.log.info("Remove %s and retry to continue" % outputPath)
        return S_ERROR("Remove %s and retry to continue" % outputPath)

    # The path was just checked not to exist, so no second existence test is needed.
    self.log.verbose("Creating directory %s" % outputPath)
    os.mkdir(outputPath)

    outputs = result["Value"]
    # (key in the reply, file name, message when written, message when missing)
    streams = (
        ("StdOut", "std.out", "Standard output written to %s", "No standard output returned"),
        ("StdErr", "std.err", "Standard error written to %s", "No standard error returned"),
    )
    for key, fileName, writtenMessage, missingMessage in streams:
        if key not in outputs:
            self.log.warn(missingMessage)
            continue
        filePath = "%s/%s" % (outputPath, fileName)
        with open(filePath, "w") as fopen:
            fopen.write(outputs[key])
        self.log.info(writtenMessage % filePath)

    self.log.always("Outputs retrieved in %s" % outputPath)
    return result
def getPilotOutput(self, gridReference, directory=''):
    """Retrieve the pilot output (std.out and std.err) of a pilot and store it on disk.

    The output is fetched with ``PilotManagerClient().getPilotOutput`` and
    written to ``<directory>/pilot_<ref>/std.out`` and ``.../std.err``, where
    ``<ref>`` is the part of the pilot reference after its last '/' (or
    'reference' when that part is empty).  An already existing output
    directory is never overwritten: an error is returned instead.

    :param gridReference: pilot reference
    :type gridReference: string
    :param directory: existing directory to create the output directory in;
                      ``self.currentDir`` is used when empty
    :type directory: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, six.string_types):
        return self._errorReport('Expected string for pilot reference')

    if not directory:
        directory = self.currentDir

    if not os.path.exists(directory):
        return self._errorReport('Directory %s does not exist' % directory)

    result = PilotManagerClient().getPilotOutput(gridReference)
    if not result['OK']:
        return result

    gridReferenceSmall = gridReference.split('/')[-1] or 'reference'
    outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

    if os.path.exists(outputPath):
        self.log.info('Remove %s and retry to continue' % outputPath)
        return S_ERROR('Remove %s and retry to continue' % outputPath)

    # The path was just checked not to exist, so no second existence test is needed.
    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    # (key in the reply, file name, message when written, message when missing)
    streams = (
        ('StdOut', 'std.out', 'Standard output written to %s', 'No standard output returned'),
        ('StdErr', 'std.err', 'Standard error written to %s', 'No standard error returned'),
    )
    for key, fileName, writtenMessage, missingMessage in streams:
        if key not in outputs:
            self.log.warn(missingMessage)
            continue
        filePath = '%s/%s' % (outputPath, fileName)
        with open(filePath, 'w') as fopen:
            fopen.write(outputs[key])
        self.log.info(writtenMessage % filePath)

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result
def killPilot(self, gridReference):
    """Kill the pilot specified.

    The request is forwarded to ``PilotManagerClient().killPilot`` and its
    result is returned unchanged.

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, six.string_types):
        return PilotManagerClient().killPilot(gridReference)

    return self._errorReport('Expected string for pilot reference')
def getPilotLoggingInfo(self, gridReference):
    """Retrieve the pilot logging info for a pilot reference.

    The request is forwarded to ``PilotManagerClient().getPilotLoggingInfo``
    and its result is returned unchanged.

    :param gridReference: Grid pilot job reference Id
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, six.string_types):
        loggingInfo = PilotManagerClient().getPilotLoggingInfo(gridReference)
        return loggingInfo

    return self._errorReport('Expected string for pilot reference')
def finalize(self):
    """Job Agent finalization method.

    Reports the pilot as 'Done' through ``PilotManagerClient().setPilotStatus``,
    passing the GridCE and CEQueue values of /LocalSite.  A failed report is
    only logged as a warning.

    :return: S_OK
    """
    localGridCE = gConfig.getValue('/LocalSite/GridCE', '')
    localQueue = gConfig.getValue('/LocalSite/CEQueue', '')

    pilotManager = PilotManagerClient()
    report = pilotManager.setPilotStatus(
        str(self.pilotReference),
        'Done',
        localGridCE,
        'Report from JobAgent',
        self.siteName,
        localQueue,
    )
    if not report['OK']:
        self.log.warn('Issue setting the pilot status', report['Message'])

    return S_OK()
def main():
    """Print the logging info of a pilot selected by UUID or by job ID.

    Registers the ``-u/--uuid`` and ``-j/--jobid`` switches and parses the
    command line.  When a job ID is set, ``getPilots`` is queried for that job
    and the first key of the returned value replaces the module-level ``uuid``.
    The logging info of that pilot is then written with ``gLogger.notice``.
    Any failed call is reported with ``gLogger.error`` followed by
    ``DIRAC.exit(1)``; on success the script ends with ``DIRAC.exit(0)``.
    """
    global uuid
    global jobid

    def _unwrap(reply):
        """Return reply["Value"]; on a failed reply log the message and call DIRAC.exit(1)."""
        if reply["OK"]:
            return reply["Value"]
        gLogger.error(reply["Message"])
        DIRAC.exit(1)
        return reply["Value"]

    Script.registerSwitch("u:", "uuid=", "get PilotsLogging for given Pilot UUID", setUUID)
    Script.registerSwitch("j:", "jobid=", "get PilotsLogging for given Job ID", setJobID)
    Script.parseCommandLine()

    # The client is imported only once the command line has been parsed
    # (NOTE(review): presumably required by DIRAC initialisation - confirm).
    from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

    if jobid:
        jobPilots = _unwrap(PilotManagerClient().getPilots(jobid))
        gLogger.debug(jobPilots)
        uuid = list(jobPilots)[0]

    gLogger.notice(_unwrap(PilotManagerClient().getPilotLoggingInfo(uuid)))
    DIRAC.exit(0)
def getCEStatus(self):
    """Method to return information on running and pending jobs.

    Warning: information currently returned depends on the PilotManager and not HTCondor.
    Results might be wrong if pilots or jobs are submitted manually via the CE.

    :return: S_OK carrying the extra keys SubmittedJobs, RunningJobs and
             WaitingJobs; a counter stays 0 when its pilot count query fails
    """
    result = S_OK()
    for counterName in ("SubmittedJobs", "RunningJobs", "WaitingJobs"):
        result[counterName] = 0

    # Waiting pilots are counted first, then running ones.
    counters = (
        ("WaitingJobs", PilotStatus.PILOT_WAITING_STATES),
        ("RunningJobs", PilotStatus.RUNNING),
    )
    for counterName, pilotStates in counters:
        condDict = {"DestinationSite": self.ceName, "Status": pilotStates}
        countResult = PilotManagerClient().countPilots(condDict)
        if not countResult["OK"]:
            self.log.warn("Failure getting pilot count for %s: %s " % (self.ceName, countResult["Message"]))
            continue
        result[counterName] = int(countResult["Value"])

    return result
def getPilotInfo(self, gridReference):
    """Retrieve info relative to a pilot reference.

    The request is forwarded to ``PilotManagerClient().getPilotInfo`` and its
    result is returned unchanged.

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, six.string_types):
        return PilotManagerClient().getPilotInfo(gridReference)

    return self._errorReport('Expected string for pilot reference')
def finalize(self):
    """Job Agent finalization method.

    Shuts the computing element down, then reports the pilot as
    ``PilotStatus.DONE`` through ``PilotManagerClient().setPilotStatus``,
    passing the GridCE and CEQueue values of /LocalSite.  Failures of either
    step are only logged.

    :return: S_OK
    """
    # wait for all jobs to be completed
    shutdown = self.computingElement.shutdown()
    if shutdown["OK"]:
        if shutdown["Value"]:
            self.log.info("Job submission(s) result", shutdown["Value"])
    else:
        self.log.error("CE could not be properly shut down", shutdown["Message"])

    localGridCE = gConfig.getValue("/LocalSite/GridCE", "")
    localQueue = gConfig.getValue("/LocalSite/CEQueue", "")

    pilotManager = PilotManagerClient()
    report = pilotManager.setPilotStatus(
        str(self.pilotReference),
        PilotStatus.DONE,
        localGridCE,
        "Report from JobAgent",
        self.siteName,
        localQueue,
    )
    if not report["OK"]:
        self.log.warn("Issue setting the pilot status", report["Message"])

    return S_OK()
def initializeHandler(cls, svcInfoDict):
    """Initialize the DBs and the pilot manager client used by the handler.

    JobDB, JobLoggingDB and TaskQueueDB are loaded through ObjectLoader and
    instantiated; ElasticJobParametersDB is added when the Operations flag
    ``/Services/JobMonitoring/useESForJobParametersFlag`` is set.

    :param svcInfoDict: not used in this method
    :return: S_OK, the failed loadObject result, or S_ERROR on RuntimeError
    """
    # (class attribute, module path, class name), loaded in this order
    databases = (
        ("jobDB", "WorkloadManagementSystem.DB.JobDB", "JobDB"),
        ("jobLoggingDB", "WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB"),
        ("taskQueueDB", "WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB"),
    )
    try:
        for attribute, modulePath, className in databases:
            loaded = ObjectLoader().loadObject(modulePath, className)
            if not loaded["OK"]:
                return loaded
            setattr(cls, attribute, loaded["Value"]())
    except RuntimeError as excp:
        return S_ERROR("Can't connect to DB: %s" % excp)

    # Stays None unless the ES flag below is enabled.
    cls.elasticJobParametersDB = None
    if Operations().getValue("/Services/JobMonitoring/useESForJobParametersFlag", False):
        try:
            loaded = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.ElasticJobParametersDB", "ElasticJobParametersDB"
            )
            if not loaded["OK"]:
                return loaded
            elasticDBClass = loaded["Value"]
            cls.elasticJobParametersDB = elasticDBClass()
        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

    cls.pilotManager = PilotManagerClient()

    return S_OK()
def __getJobPilotStatus(self, jobID):
    """Get the job pilot status.

    :param jobID: job whose 'Pilot_Reference' parameter is looked up
    :return: S_OK(pilot status), S_OK('NoPilot') when the job has no pilot
             reference or the pilot is not found (DErrno.EWMSNOPILOT),
             otherwise the failed result of the underlying call
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
        return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    infoResult = PilotManagerClient().getPilotInfo(pilotReference)
    if infoResult['OK']:
        return S_OK(infoResult['Value'][pilotReference]['Status'])

    if DErrno.cmpError(infoResult, DErrno.EWMSNOPILOT):
        self.log.warn("No pilot found", "for job %d: %s" % (jobID, infoResult['Message']))
        return S_OK('NoPilot')

    self.log.error('Failed to get pilot information',
                   'for job %d: %s' % (jobID, infoResult['Message']))
    return infoResult
def getJobPilots(self, jobID):
    """Extract the list of submitted pilots and their status for a given
    jobID from the WMS.  A successful result is pretty-printed with
    ``gLogger.notice``.

    >>> gLogger.notice(dirac.getJobPilots())
    {'OK': True, 'Value': {PilotID:{StatusDict}}}

    :param jobID: JobID
    :type jobID: integer or string
    :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
        # String job IDs are converted to integers before the query.
        try:
            jobID = int(jobID)
        except Exception as conversionError:
            return self._errorReport(str(conversionError),
                                     'Expected integer or string for existing jobID')

    pilotsResult = PilotManagerClient().getPilots(jobID)
    if not pilotsResult['OK']:
        return pilotsResult

    gLogger.notice(self.pPrint.pformat(pilotsResult['Value']))
    return pilotsResult
def __getJobPilotStatus(self, jobID):
    """Get the job pilot status.

    :param jobID: job whose 'Pilot_Reference' parameter is looked up
    :return: S_OK(pilot status), S_OK('NoPilot') when the job has no pilot
             reference or the error message contains "No pilots found",
             S_ERROR when the pilot information cannot be retrieved
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
        return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference')
    if not pilotReference:
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    infoResult = PilotManagerClient().getPilotInfo(pilotReference)
    if infoResult['OK']:
        return S_OK(infoResult['Value'][pilotReference]['Status'])

    if "No pilots found" in infoResult['Message']:
        self.log.warn(infoResult['Message'])
        return S_OK('NoPilot')

    self.log.error('Failed to get pilot information',
                   'for job %d: ' % jobID + infoResult['Message'])
    return S_ERROR('Failed to get the pilot status')
def _getJobPilotStatus(self, jobID):
    """Get the job pilot status.

    :param jobID: job whose "Pilot_Reference" parameter is looked up
    :return: S_OK(pilot status), S_OK("NoPilot") when the job has no pilot
             reference or the pilot is not found (DErrno.EWMSNOPILOT),
             otherwise the failed result of the underlying call
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, "Pilot_Reference")
    if not paramResult["OK"]:
        return paramResult

    pilotReference = paramResult["Value"].get("Pilot_Reference", "Unknown")
    if pilotReference == "Unknown":
        # There is no pilot reference, hence its status is unknown
        return S_OK("NoPilot")

    infoResult = PilotManagerClient().getPilotInfo(pilotReference)
    if infoResult["OK"]:
        return S_OK(infoResult["Value"][pilotReference]["Status"])

    if DErrno.cmpError(infoResult, DErrno.EWMSNOPILOT):
        self.log.warn("No pilot found", "for job %d: %s" % (jobID, infoResult["Message"]))
        return S_OK("NoPilot")

    self.log.error("Failed to get pilot information",
                   "for job %d: %s" % (jobID, infoResult["Message"]))
    return infoResult
def getPilotSummary(self, startDate='', endDate=''):
    """Retrieve the pilot summary from the WMS and print it as a table.

    One line per CE is written with ``gLogger.notice``: the CE name followed
    by a Status/Count pair for each of its states, sorted by state name.  The
    full dictionary of results is also returned.

    >>> gLogger.notice(dirac.getPilotSummary())
    {'OK': True, 'Value': {CE:{Status:Count}}}

    :param startDate: passed unchanged to PilotManagerClient.getPilotSummary
    :param endDate: passed unchanged to PilotManagerClient.getPilotSummary
    :return: S_OK,S_ERROR
    """
    result = PilotManagerClient().getPilotSummary(startDate, endDate)
    if not result['OK']:
        return result

    ceDict = result['Value']

    # The header carries as many Status/Count columns as the CE with the most
    # states; the "or [0]" keeps max() valid when no CE is returned.
    maxStates = max([len(summary) for summary in ceDict.values()] or [0])
    headers = 'CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
    gLogger.notice(headers)

    # items() and the multiplication above replace iteritems()/xrange(),
    # which only exist on Python 2.
    for ce, summary in ceDict.items():
        columns = ''.join(state.ljust(12) + str(summary[state]).ljust(12)
                          for state in sorted(summary))
        gLogger.notice(ce.ljust(28) + columns)

    return result
class PilotStatusAgent(AgentModule):
    """Agent declaring long-unchanged pilots Deleted, accounting for them and
    asking the pilot manager to clear old pilot records.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
        for the agent restart
    """

    def __init__(self, *args, **kwargs):
        """c'tor"""
        super().__init__(*args, **kwargs)

        # Created in initialize()
        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None

    #############################################################################
    def initialize(self):
        """Sets defaults and creates the DB and client objects.

        :return: S_OK
        """
        self.am_setOption("GridEnv", "")
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption("ClearPilotsDelay", 30)
        self.clearAbortedDelay = self.am_getOption("ClearAbortedPilotsDelay", 7)
        self.pilots = PilotManagerClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Handles the pilots not updated in the last ``PilotStalledDays`` days,
        then asks the pilot manager to clear old pilots.

        :return: S_OK, or the failed result of the DB connection request
        """
        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        try:
            # Now handle pilots not updated in the last N days and declare them Deleted.
            # Its result is deliberately not propagated: failures are logged inside.
            self.handleOldPilots(connection)
        finally:
            # Close the connection even when handling the old pilots raises.
            connection.close()

        result = self.pilots.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
        if not result["OK"]:
            self.log.warn("Failed to clear old pilots in the PilotAgentsDB")

        return S_OK()

    def handleOldPilots(self, connection):
        """
        select all pilots that have not been updated in the last N days and declared them
        Deleted, accounting for them.

        Pilots with at least one recently updated job are skipped.  The
        selected pilots are accounted and killed in batches.

        :param connection: DB connection passed on to the accounting step
        :return: S_OK, or the failed result of a PilotAgentsDB query
        """
        pilotsToAccount = {}
        timeLimitToConsider = TimeUtilities.toString(
            datetime.datetime.utcnow() - TimeUtilities.day * self.pilotStalledDays
        )
        result = self.pilotDB.selectPilots(
            {"Status": PilotStatus.PILOT_TRANSIENT_STATES},
            older=timeLimitToConsider,
            timeStamp="LastUpdateTime",
        )
        if not result["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return result
        if not result["Value"]:
            return S_OK()

        refList = result["Value"]
        result = self.pilotDB.getPilotInfo(refList)
        if not result["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return result

        pilotsDict = result["Value"]

        for pRef in pilotsDict:
            if pilotsDict[pRef].get("Jobs") and self._checkJobLastUpdateTime(
                pilotsDict[pRef]["Jobs"], self.pilotStalledDays
            ):
                self.log.debug(
                    "%s should not be deleted since one job of %s is running."
                    % (str(pRef), str(pilotsDict[pRef]["Jobs"]))
                )
                continue
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict["Status"] = PilotStatus.DELETED
            deletedJobDict["StatusDate"] = datetime.datetime.utcnow()
            pilotsToAccount[pRef] = deletedJobDict
            # Flush as soon as more than 100 pilots are pending
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        # Remaining pilots of the last, incomplete batch
        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """account for pilots

        When the ``PilotAccountingEnabled`` option is "yes" an accounting
        record is sent for every pilot.  The new status is written to the
        PilotAgentsDB only if accounting is disabled or was sent successfully.

        :param pilotsToAccount: dict of pilot reference -> pilot dict carrying
                                Status, DestinationSite and StatusDate
        :param connection: DB connection used for the PilotAgentsDB calls
        :return: S_OK, or the failed result of the info/accounting step
        """
        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots", retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Pilots already in a final state keep the values stored in the DB
                    if dbData[pref]["Status"] not in PilotStatus.PILOT_FINAL_STATES:
                        dbData[pref]["Status"] = pilotsToAccount[pref]["Status"]
                        dbData[pref]["DestinationSite"] = pilotsToAccount[pref]["DestinationSite"]
                        dbData[pref]["LastUpdateTime"] = pilotsToAccount[pref]["StatusDate"]

            retVal = self._addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                # This failure comes from building the records, not from the DB query above.
                self.log.error("Failed to add accounting records for pilots", retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports", retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                self.pilotDB.setPilotStatus(
                    pRef, pDict["Status"], pDict["DestinationSite"], pDict["StatusDate"], conn=connection
                )

        return S_OK()

    def _addPilotsAccountingReport(self, pilotsData):
        """fill accounting data

        Builds one PilotAccounting record per pilot and registers it with
        ``gDataStoreClient.addRegister``.

        :param pilotsData: dict of pilot reference -> pilot info dict
        :return: S_OK, or the first failed addRegister result
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData["LastUpdateTime"])
            pA.setStartTime(pData["SubmissionTime"])
            retVal = Registry.getUsernameForDN(pData["OwnerDN"])
            if not retVal["OK"]:
                userName = "******"
                self.log.error(
                    "Can't determine username for dn",
                    ": %s : %s" % (pData["OwnerDN"], retVal["Message"]),
                )
            else:
                userName = retVal["Value"]
            pA.setValueByKey("User", userName)
            pA.setValueByKey("UserGroup", pData["OwnerGroup"])
            result = getCESiteMapping(pData["DestinationSite"])
            if result["OK"] and pData["DestinationSite"] in result["Value"]:
                pA.setValueByKey("Site", result["Value"][pData["DestinationSite"]].strip())
            else:
                pA.setValueByKey("Site", "Unknown")
            pA.setValueByKey("GridCE", pData["DestinationSite"])
            pA.setValueByKey("GridMiddleware", pData["GridType"])
            pA.setValueByKey("GridResourceBroker", pData["Broker"])
            pA.setValueByKey("GridStatus", pData["Status"])
            if "Jobs" not in pData:
                pA.setValueByKey("Jobs", 0)
            else:
                pA.setValueByKey("Jobs", len(pData["Jobs"]))
            self.log.verbose("Added accounting record for pilot %s" % pData["PilotID"])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal["OK"]:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """Kill every pilot of ``acc`` for which pilot info with a Status is found.

        :param acc: dict keyed by pilot reference; only the keys are used
        """
        for pilotRef in sorted(acc):
            result = self.diracadmin.getPilotInfo(pilotRef)
            if result["OK"] and pilotRef in result["Value"] and "Status" in result["Value"][pilotRef]:
                ret = self.diracadmin.killPilot(str(pilotRef))
                if ret["OK"]:
                    self.log.info(
                        "Successfully deleted",
                        ": %s (Status : %s)" % (pilotRef, result["Value"][pilotRef]["Status"]),
                    )
                else:
                    self.log.error("Failed to delete pilot: ", "%s : %s" % (pilotRef, ret["Message"]))
            else:
                self.log.error("Failed to get pilot info", "%s : %s" % (pilotRef, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """Tell whether any job of ``joblist`` was updated in the last ``StalledDays`` days.

        :param joblist: job IDs (converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as one job has a LastUpdateTime newer than the
                 limit, False otherwise
        """
        timeLimitToConsider = datetime.datetime.utcnow() - TimeUtilities.day * StalledDays
        ret = False
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if result["OK"]:
                if "LastUpdateTime" in result["Value"]:
                    lastUpdateTime = result["Value"]["LastUpdateTime"]
                    if TimeUtilities.fromString(lastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug(
                            "Since %s updates LastUpdateTime on %s this does not to need to be deleted."
                            % (str(jobID), str(lastUpdateTime))
                        )
                        break
            else:
                self.log.error("Error taking job info from DB", result["Message"])
        return ret
def test_PilotsDB():
    """Exercise the PilotManagerClient calls one after the other.

    Two pilots are created and deleted again; in between, the output, info,
    summary, monitoring, accounting-flag, status, job and benchmark calls are
    checked against the values expected for a single 'Submitted' pilot.
    """

    def _ok(res):
        """Assert that the call succeeded and hand the result back."""
        assert res['OK'] is True, res['Message']
        return res

    pilots = PilotManagerClient()

    # First pilot: add, count, delete, count again.
    _ok(pilots.addPilotTQReference(['aPilot'], 1, '/a/ownerDN', 'a/owner/Group'))
    res = _ok(pilots.getCurrentPilotCounters({}))
    assert res['Value'] == {'Submitted': 1}, res['Value']
    _ok(pilots.deletePilots('aPilot'))
    res = _ok(pilots.getCurrentPilotCounters({}))
    assert res['Value'] == {}, res['Value']

    # Second pilot: stored output and info.
    _ok(pilots.addPilotTQReference(['anotherPilot'], 1, '/a/ownerDN', 'a/owner/Group'))
    _ok(pilots.storePilotOutput('anotherPilot', 'This is an output', 'this is an error'))
    res = _ok(pilots.getPilotOutput('anotherPilot'))
    assert res['Value'] == {'OwnerDN': '/a/ownerDN',
                            'OwnerGroup': 'a/owner/Group',
                            'StdErr': 'this is an error',
                            'FileList': [],
                            'StdOut': 'This is an output'}
    res = _ok(pilots.getPilotInfo('anotherPilot'))
    assert res['Value']['anotherPilot']['AccountingSent'] == 'False', res['Value']
    assert res['Value']['anotherPilot']['PilotJobReference'] == 'anotherPilot', res['Value']

    # Selection, summary and monitoring views.
    _ok(pilots.selectPilots({}))
    res = _ok(pilots.getPilotSummary('', ''))
    assert res['Value']['Total']['Submitted'] == 1
    res = _ok(pilots.getPilotMonitorWeb({}, [], 0, 100))
    assert res['Value']['TotalRecords'] == 1
    res = _ok(pilots.getPilotMonitorSelectors())
    assert res['Value'] == {'GridType': ['DIRAC'],
                            'OwnerGroup': ['a/owner/Group'],
                            'DestinationSite': ['NotAssigned'],
                            'Broker': ['Unknown'],
                            'Status': ['Submitted'],
                            'OwnerDN': ['/a/ownerDN'],
                            'GridSite': ['Unknown'],
                            'Owner': []}, res['Value']
    res = _ok(pilots.getPilotSummaryWeb({}, [], 0, 100))
    assert res['Value']['TotalRecords'] == 1, res['Value']

    # Updates: accounting flag and status are read back afterwards.
    _ok(pilots.setAccountingFlag('anotherPilot', 'True'))
    _ok(pilots.setPilotStatus('anotherPilot', 'Running'))
    res = _ok(pilots.getPilotInfo('anotherPilot'))
    assert res['Value']['anotherPilot']['AccountingSent'] == 'True', res['Value']
    assert res['Value']['anotherPilot']['Status'] == 'Running', res['Value']

    _ok(pilots.setJobForPilot(123, 'anotherPilot'))
    _ok(pilots.setPilotBenchmark('anotherPilot', 12.3))
    _ok(pilots.countPilots({}))

    # Not exercised here:
    # res = pilots.getCounters()
    # # getPilotStatistics

    # Clean up and check that nothing is left.
    _ok(pilots.deletePilots('anotherPilot'))
    res = _ok(pilots.getCurrentPilotCounters({}))
    assert res['Value'] == {}, res['Value']
# Command line: exactly one of --uuid / --jobid is expected.
Script.registerSwitch('u:', 'uuid=', 'get PilotsLogging for given Pilot UUID', setUUID)
Script.registerSwitch('j:', 'jobid=', 'get PilotsLogging for given Job ID', setJobID)
Script.setUsageMessage('\n'.join([
    __doc__.split('\n')[1],
    'Usage:',
    ' %s option value ' % Script.scriptName,
    'Only one option (either uuid or jobid) should be used.'
]))
Script.parseCommandLine()

# The client is imported only once the command line has been parsed
# (NOTE(review): presumably required by DIRAC initialisation - confirm).
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

if jobid:
    # Resolve the job ID to a pilot: the first key of the returned value is used.
    result = PilotManagerClient().getPilots(jobid)
    if not result['OK']:
        gLogger.error(result['Message'])
        DIRAC.exit(1)
    gLogger.debug(result['Value'])
    # dict.keys() cannot be indexed on Python 3; list() works on Python 2 and 3.
    uuid = list(result['Value'])[0]

result = PilotManagerClient().getPilotLoggingInfo(uuid)
if not result['OK']:
    gLogger.error(result['Message'])
    DIRAC.exit(1)
gLogger.notice(result['Value'])

DIRAC.exit(0)
class PilotCommand(Command):
    """
    Pilot "master" Command.

    Queries the pilot summary (through ``self.pilots``) for a Site or a
    Resource and stores / reads the result through the pilot cache of
    ``self.rmClient``.
    """

    def __init__(self, args=None, clients=None):
        super(PilotCommand, self).__init__(args, clients)

        # Use the clients handed in through self.apis, else build our own.
        self.pilots = self.apis['Pilots'] if 'Pilots' in self.apis else PilotManagerClient()
        self.rmClient = (self.apis['ResourceManagementClient']
                         if 'ResourceManagementClient' in self.apis
                         else ResourceManagementClient())

    def _storeCommand(self, result):
        """
        Store the records produced by doNew in the pilot cache.

        :param result: iterable of dictionaries holding at least the keys
                       'Site', 'CE', 'PilotsPerJob', 'PilotJobEff' and 'Status'
        :return: S_OK() or the first failing addOrModifyPilotCache result
        """
        cacheColumns = ('Site', 'CE', 'PilotsPerJob', 'PilotJobEff', 'Status')

        for entry in result:
            stored = self.rmClient.addOrModifyPilotCache(*[entry[column] for column in cacheColumns])
            if not stored['OK']:
                return stored

        return S_OK()

    def _prepareCommand(self):
        """
        Validate self.args, which must provide:
          - name : the name(s) to query
          - element : either 'Site' or 'Resource'

        :return: S_OK((element, name)) or S_ERROR when an argument is missing or invalid
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        if 'element' not in self.args:
            return S_ERROR('element is missing')

        element = self.args['element']
        if element not in ('Site', 'Resource'):
            return S_ERROR('"%s" is not Site nor Resource' % element)

        return S_OK((element, self.args['name']))

    def doNew(self, masterParams=None):
        """
        Fetch a fresh pilot summary, normalise it and store it in the cache.

        :param masterParams: optional (element, name) tuple; when None the
                             pair is taken from self.args via _prepareCommand
        :return: S_OK(list of per-record dictionaries) or S_ERROR
        """
        if masterParams is None:
            prepared = self._prepareCommand()
            if not prepared['OK']:
                return prepared
            masterParams = prepared['Value']
        element, name = masterParams

        if element == 'Site':
            selection = {'GridSite': name}
        elif element == 'Resource':
            selection = {'ExpandSite': name}
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        summary = self.pilots.getPilotSummaryWeb(selection, [], 0, 0)
        if not summary['OK']:
            return summary
        summary = summary['Value']

        if 'ParameterNames' not in summary:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        if 'Records' not in summary:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        columns = summary['ParameterNames']

        uniformResult = []
        for row in summary['Records']:
            # Per the original comment, the resulting dictionary has the keys:
            # 'Site', 'CE', 'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running',
            # 'Done', 'Aborted', 'Done_Empty', 'Aborted_Hour', 'Total', 'PilotsPerJob',
            # 'PilotJobEff', 'Status', 'InMask'
            pilotDict = dict(zip(columns, row))
            for ratio in ('PilotsPerJob', 'PilotJobEff'):
                pilotDict[ratio] = float(pilotDict[ratio])
            uniformResult.append(pilotDict)

        stored = self._storeCommand(uniformResult)
        if not stored['OK']:
            return stored

        return S_OK(uniformResult)

    def doCache(self):
        """
        Read the pilot cache for the element / name given in self.args.

        :return: S_OK(list of dictionaries, one per cached row) or the failing result
        """
        prepared = self._prepareCommand()
        if not prepared['OK']:
            return prepared
        element, name = prepared['Value']

        if element == 'Site':
            # WMS returns Site entries with CE = 'Multiple'
            site, ce = name, 'Multiple'
        elif element == 'Resource':
            site, ce = None, name
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        cached = self.rmClient.selectPilotCache(site, ce)
        if not cached['OK']:
            return cached

        rows = [dict(zip(cached['Columns'], row)) for row in cached['Value']]
        return S_OK(rows)

    def doMaster(self):
        """
        Run doNew for all sites and then for all CEs, recording failures
        in self.metrics['failed'].

        :return: S_OK(self.metrics), or the failing getSites / getCESiteMapping result
        """
        sitesRes = getSites()
        if not sitesRes['OK']:
            return sitesRes

        mappingRes = getCESiteMapping()
        if not mappingRes['OK']:
            return mappingRes

        targets = (('Site', sitesRes['Value']), ('Resource', list(mappingRes['Value'])))
        for target in targets:
            outcome = self.doNew(target)
            if not outcome['OK']:
                self.metrics['failed'].append(outcome['Message'])

        return S_OK(self.metrics)
def test_PilotsDB():
    """Walk a pilot through its life cycle using PilotManagerClient calls.

    Both test pilots are deleted up front so the test can be re-run, then
    'aPilot' and 'anotherPilot' are registered in turn and the counters,
    output, info, summary and selector calls are checked against the values
    expected for a single registered pilot.

    NOTE(review): presumably needs a reachable PilotManager service holding no
    other pilots, since the counters are compared exactly -- confirm.
    """

    def verified(reply):
        # Every checked call must report success; the service message is the failure text.
        assert reply["OK"], reply["Message"]
        return reply

    client = PilotManagerClient()

    # This will allow you to run the test again if necessary
    # (the outcome of this cleanup is deliberately not checked).
    for leftover in ("aPilot", "anotherPilot"):
        client.deletePilots(leftover)

    # --- first pilot: register, count, delete -------------------------------
    verified(client.addPilotTQReference(["aPilot"], 1, "/a/ownerDN", "a/owner/Group"))
    reply = verified(client.getCurrentPilotCounters({}))
    assert reply["Value"] == {"Submitted": 1}
    verified(client.deletePilots("aPilot"))
    reply = verified(client.getCurrentPilotCounters({}))
    assert reply["Value"] == {}

    # --- second pilot: register and store its output ------------------------
    verified(client.addPilotTQReference(["anotherPilot"], 1, "/a/ownerDN", "a/owner/Group"))
    verified(client.storePilotOutput("anotherPilot", "This is an output", "this is an error"))
    reply = verified(client.getPilotOutput("anotherPilot"))
    expectedOutput = {
        "OwnerDN": "/a/ownerDN",
        "OwnerGroup": "a/owner/Group",
        "StdErr": "this is an error",
        "FileList": [],
        "StdOut": "This is an output",
    }
    assert reply["Value"] == expectedOutput

    reply = verified(client.getPilotInfo("anotherPilot"))
    assert reply["Value"]["anotherPilot"]["AccountingSent"] == "False"
    assert reply["Value"]["anotherPilot"]["PilotJobReference"] == "anotherPilot"

    # --- selection / monitoring views ----------------------------------------
    verified(client.selectPilots({}))
    reply = verified(client.getPilotSummary("", ""))
    assert reply["Value"]["Total"]["Submitted"] == 1
    reply = verified(client.getPilotMonitorWeb({}, [], 0, 100))
    assert reply["Value"]["TotalRecords"] == 1
    reply = verified(client.getPilotMonitorSelectors())
    expectedSelectors = {
        "GridType": ["DIRAC"],
        "OwnerGroup": ["a/owner/Group"],
        "DestinationSite": ["NotAssigned"],
        "Broker": ["Unknown"],
        "Status": ["Submitted"],
        "OwnerDN": ["/a/ownerDN"],
        "GridSite": ["Unknown"],
        "Owner": [],
    }
    assert reply["Value"] == expectedSelectors
    reply = verified(client.getPilotSummaryWeb({}, [], 0, 100))
    assert reply["Value"]["TotalRecords"] == 1

    # --- setters, then read the new state back -------------------------------
    verified(client.setAccountingFlag("anotherPilot", "True"))
    verified(client.setPilotStatus("anotherPilot", "Running"))
    reply = verified(client.getPilotInfo("anotherPilot"))
    assert reply["Value"]["anotherPilot"]["AccountingSent"] == "True"
    assert reply["Value"]["anotherPilot"]["Status"] == "Running"

    verified(client.setJobForPilot(123, "anotherPilot"))
    verified(client.setPilotBenchmark("anotherPilot", 12.3))
    verified(client.countPilots({}))

    # Not exercised here: getCounters, getPilotStatistics

    # --- clean up --------------------------------------------------------------
    verified(client.deletePilots("anotherPilot"))
    reply = verified(client.getCurrentPilotCounters({}))
    assert reply["Value"] == {}
def export_getJobPageSummaryWeb(self, selectDict, sortList, startItem, maxItems, selectJobs=True):
    """ Get the summary of the job information for a given page in the
        job monitor in a generic format

        :param dict selectDict: selection conditions. The keys 'FromDate' (or,
                                for backward compatibility, 'LastUpdate'),
                                'ToDate' and 'PilotJobReference' are consumed
                                here and removed from the dictionary, which is
                                therefore modified in place.
        :param list sortList: sorting instructions; only the first
                              (attribute, order) pair is used
        :param int startItem: offset of the first job of the page
        :param int maxItems: maximum number of jobs in the page
        :param bool selectJobs: when False only the counters are returned

        :return: S_OK(dict) with 'TotalRecords' and, when jobs match, 'Extras'
                 (count per status) plus 'ParameterNames' and 'Records' if
                 selectJobs is set; S_ERROR on failure
    """
    resultDict = {}

    startDate = selectDict.get('FromDate', None)
    if startDate:
        del selectDict['FromDate']
    # For backward compatibility
    if startDate is None:
        startDate = selectDict.get('LastUpdate', None)
        if startDate:
            del selectDict['LastUpdate']
    endDate = selectDict.get('ToDate', None)
    if endDate:
        del selectDict['ToDate']

    # Provide JobID bound to a specific PilotJobReference
    # There is no reason to have both PilotJobReference and JobID in selectDict
    # If that occurs, use the JobID instead of the PilotJobReference
    pilotJobRefs = selectDict.get('PilotJobReference')
    if pilotJobRefs:
        del selectDict['PilotJobReference']
        if not selectDict.get('JobID'):
            if not isinstance(pilotJobRefs, list):
                pilotJobRefs = [pilotJobRefs]
            selectDict['JobID'] = []
            # One client is enough for all the references
            pilotClient = PilotManagerClient()
            for pilotJobRef in pilotJobRefs:
                res = pilotClient.getPilotInfo(pilotJobRef)
                if not res['OK']:
                    continue
                # The reference may be absent from the reply: skip it rather than fail
                pilotInfo = res['Value'].get(pilotJobRef, {})
                if 'Jobs' in pilotInfo:
                    selectDict['JobID'].extend(pilotInfo['Jobs'])

    result = self.jobPolicy.getControlledUsers(RIGHT_GET_INFO)
    if not result['OK']:
        return S_ERROR('Failed to evaluate user rights')
    if result['Value'] != 'ALL':
        selectDict[('Owner', 'OwnerGroup')] = result['Value']

    # Sorting instructions. Only one for the moment.
    if sortList:
        orderAttribute = sortList[0][0] + ":" + sortList[0][1]
    else:
        orderAttribute = None

    statusDict = {}
    result = self.gJobDB.getCounters('Jobs', ['Status'], selectDict,
                                     newer=startDate,
                                     older=endDate,
                                     timeStamp='LastUpdateTime')

    # A failing counters query is treated like "no jobs", as in the original
    nJobs = 0
    if result['OK']:
        for stDict, count in result['Value']:
            nJobs += count
            statusDict[stDict['Status']] = count

    resultDict['TotalRecords'] = nJobs
    if nJobs == 0:
        return S_OK(resultDict)

    resultDict['Extras'] = statusDict

    if selectJobs:
        iniJob = startItem
        if iniJob >= nJobs:
            return S_ERROR('Item number out of range')

        result = self.gJobDB.selectJobs(selectDict, orderAttribute=orderAttribute,
                                        newer=startDate, older=endDate, limit=(maxItems, iniJob))
        if not result['OK']:
            return S_ERROR('Failed to select jobs: ' + result['Message'])

        summaryJobList = result['Value']
        if not self.globalJobsInfo:
            validJobs, _invalidJobs, _nonauthJobs, _ownJobs = self.jobPolicy.evaluateJobRights(summaryJobList,
                                                                                               RIGHT_GET_INFO)
            summaryJobList = validJobs

        result = self.getAttributesForJobList(summaryJobList, SUMMARY)
        if not result['OK']:
            return S_ERROR('Failed to get job summary: ' + result['Message'])

        summaryDict = result['Value']

        # Evaluate last sign of life time
        for jobDict in summaryDict.values():
            if jobDict['HeartBeatTime'] == 'None':
                jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']
                continue
            lastTime = Time.fromString(jobDict['LastUpdateTime'])
            hbTime = Time.fromString(jobDict['HeartBeatTime'])
            # Not only Stalled jobs but also Failed jobs because Stalled
            if ((hbTime - lastTime) > timedelta(0) or
                    jobDict['Status'] == "Stalled" or
                    jobDict['MinorStatus'].startswith(('Job stalled', 'Stalling'))):
                jobDict['LastSignOfLife'] = jobDict['HeartBeatTime']
            else:
                jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']

        tqDict = {}
        result = self.gTaskQueueDB.getTaskQueueForJobs(summaryJobList)
        if result['OK']:
            tqDict = result['Value']

        # If no jobs can be selected after the properties check
        if not summaryDict:
            return S_OK(resultDict)

        # prepare the standard structure now
        # The first job fixes the column order. Materialise the names as a list:
        # dict views can be neither indexed nor concatenated on Python 3.
        firstJob = next(iter(summaryDict))
        paramNames = list(summaryDict[firstJob])

        records = []
        for jobID, jobDict in summaryDict.items():
            jParList = [jobDict[pname] for pname in paramNames]
            jParList.append(tqDict.get(jobID, 0))
            records.append(jParList)

        resultDict['ParameterNames'] = paramNames + ['TaskQueueID']
        resultDict['Records'] = records

    return S_OK(resultDict)
class PilotStatusAgent(AgentModule):
    """
    Agent declaring long-unupdated pilots as Deleted (accounting for them and
    killing them) and asking the pilot manager to clear old pilots.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    # Pilot states that are still queried / considered alive
    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    # Pilot states that are final: accounting never overwrites them
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None
        # Declared here so that every attribute exists before initialize() /
        # execute() run; the values are the defaults used by those methods.
        self.pilots = None
        self.clearPilotsDelay = 30
        self.clearAbortedDelay = 7
        self.pilotStalledDays = 3
        self.gridEnv = ''

    #############################################################################
    def initialize(self):
        """Sets defaults

        :return: S_OK()
        """
        self.am_setOption('PollingTime', 120)
        self.am_setOption('GridEnv', '')
        self.am_setOption('PilotStalledDays', 3)
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)
        self.pilots = PilotManagerClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        :return: S_OK(), or the failing result when no DB connection can be obtained
        """
        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        try:
            result = self.handleOldPilots(connection)
        finally:
            # Release the connection even if the handling raises
            connection.close()
        if not result['OK']:
            # The cycle carries on, as before, but the failure is no longer silent
            self.log.warn('Failed to handle old pilots: %s' % result['Message'])

        result = self.pilots.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
        if not result['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """ Clear pilots in the faulty Waiting state

        :param dict condDict: must provide 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker'
        :return: S_OK(), or the failing selectPilots result
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {'Status': 'Waiting',
                     'OwnerDN': condDict['OwnerDN'],
                     'OwnerGroup': condDict['OwnerGroup'],
                     'GridType': condDict['GridType'],
                     'Broker': condDict['Broker']}
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()
        refList = result['Value']

        for pilotRef in refList:
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, 'Stalled', statusReason='Exceeded max waiting time')
            if not result['OK']:
                # Best effort: report and go on with the remaining pilots
                self.log.warn('Failed to set pilot %s to Stalled: %s' % (pilotRef, result['Message']))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parameteric parent job from the PilotAgentsDB

        :param pRef: reference of the parent pilot
        :param dict pDict: must provide 'ChildRefs' and, when the children have
                           to be added, 'ChildDicts' (per child 'Status' and
                           'DestinationSite')
        :param connection: DB connection handed to the PilotAgentsDB calls
        :return: S_OK() / S_ERROR, or the result of the parent deletion when at
                 least one child is already in the database
        """
        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True
                break  # one known child is enough

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No info found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        tqID = parentInfo['TaskQueueID']
        ownerDN = parentInfo['OwnerDN']
        ownerGroup = parentInfo['OwnerGroup']
        broker = parentInfo['Broker']
        gridType = parentInfo['GridType']
        result = self.pilotDB.addPilotTQReference(childList, tqID, ownerDN, ownerGroup,
                                                  broker=broker, gridType=gridType)
        if not result['OK']:
            return result

        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                                 destination=chDict['DestinationSite'],
                                                 conn=connection)
            if not result['OK']:
                children_added = False

        if not children_added:
            return S_ERROR('Failed to add children')

        # As in the original, the outcome of this deletion is not propagated
        self.pilotDB.deletePilot(pRef, conn=connection)
        return S_OK()

    def handleOldPilots(self, connection):
        """
          select all pilots that have not been updated in the last N days and declared them
          Deleted, accounting for them.

        :param connection: DB connection handed to the accounting step
        :return: S_OK(), or the failing PilotAgentsDB result
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                           older=timeLimitToConsider,
                                           timeStamp='LastUpdateTime')
        if not result['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return result
        if not result['Value']:
            return S_OK()

        refList = result['Value']
        result = self.pilotDB.getPilotInfo(refList)
        if not result['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return result

        pilotsDict = result['Value']

        for pRef, pilotInfo in pilotsDict.items():
            # Leave the pilot alone if one of its jobs was updated recently
            if pilotInfo.get('Jobs') and self._checkJobLastUpdateTime(pilotInfo['Jobs'], self.pilotStalledDays):
                self.log.debug('%s should not be deleted since one job of %s is running.' %
                               (str(pRef), str(pilotInfo['Jobs'])))
                continue
            pilotInfo['Status'] = 'Deleted'
            pilotInfo['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = pilotInfo
            # Flush in batches once more than 100 pilots are pending
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ account for pilots

        :param dict pilotsToAccount: pilot reference -> dictionary with at least
                                     'Status', 'DestinationSite' and 'StatusDate'
        :param connection: DB connection handed to the PilotAgentsDB calls
        :return: S_OK(), or the failing result of the info retrieval / report filling
        """
        pae = self.am_getOption('PilotAccountingEnabled', 'yes')
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                if pref in pilotsToAccount:
                    # A final state stored in the DB is never overwritten
                    if dbData[pref]['Status'] not in self.finalStateList:
                        dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
                        dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
                        dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

            retVal = self._addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                # The original reused the "retrieve Info" message here by mistake
                self.log.error('Fail to add accounting report for pilots', retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports", retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        # The status is only updated when accounting is disabled or was sent
        if not accountingFlag or accountingSent:
            for pRef, pDict in pilotsToAccount.items():
                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            pDict['StatusDate'],
                                            conn=connection)

        return S_OK()

    def _addPilotsAccountingReport(self, pilotsData):
        """ fill accounting data

        :param dict pilotsData: pilot reference -> pilot info dictionary
        :return: S_OK(), or the first failing gDataStoreClient.addRegister result
        """
        for pData in pilotsData.values():
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = Registry.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:", pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])

            # The mapping is indexed by CE name, so test for the CE in it; the
            # original called .strip() on the mapping itself.
            destination = pData['DestinationSite']
            siteName = 'Unknown'
            result = getCESiteMapping(destination)
            if result['OK'] and destination in result['Value']:
                siteName = result['Value'][destination].strip() or 'Unknown'
            pA.setValueByKey('Site', siteName)

            pA.setValueByKey('GridCE', destination)
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            if 'Jobs' not in pData:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pData['Jobs']))
            self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """ Kill, through DiracAdmin, every pilot of acc whose info can be retrieved

        :param dict acc: dictionary keyed by pilot reference
        """
        for i in sorted(acc.keys()):
            result = self.diracadmin.getPilotInfo(i)
            if result['OK'] and i in result['Value'] and 'Status' in result['Value'][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret['OK']:
                    self.log.info("Successfully deleted: %s (Status : %s)" % (i, result['Value'][i]['Status']))
                else:
                    self.log.error("Failed to delete pilot: ", "%s : %s" % (i, ret['Message']))
            else:
                self.log.error("Failed to get pilot info", "%s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """ Tell whether at least one job was updated within the last StalledDays days

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining the time window
        :return: True as soon as one job has a recent enough LastUpdateTime, else False
        """
        timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
        ret = False
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if result['OK']:
                if 'LastUpdateTime' in result['Value']:
                    lastUpdateTime = result['Value']['LastUpdateTime']
                    if Time.fromString(lastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug(
                            'Since %s updates LastUpdateTime on %s this does not to need to be deleted.' %
                            (str(jobID), str(lastUpdateTime)))
                        break
            else:
                self.log.error("Error taking job info from DB", result['Message'])
        return ret