def export_removeRegisters(self, entriesList): """ Remove a record for a type """ setup = self.serviceInfoDict["clientSetup"] expectedTypes = [str, datetime.datetime, datetime.datetime, list] for entry in entriesList: if len(entry) != 4: return S_ERROR("Invalid records") for i in range(len(entry)): if not isinstance(entry[i], expectedTypes[i]): return S_ERROR("%s field in the records should be %s" % (i, expectedTypes[i])) ok = 0 for entry in entriesList: startTime = int(TimeUtilities.toEpoch(entry[1])) endTime = int(TimeUtilities.toEpoch(entry[2])) record = entry[3] result = self.__acDB.deleteRecord(setup, entry[0], startTime, endTime, record) # pylint: disable=no-member if not result["OK"]: return S_OK(ok) ok += 1 return S_OK(ok)
def generateDocs(data, withTimeStamp=True): """Generator for fast bulk indexing, yields docs :param list data: list of dictionaries :param bool withTimeStamp: add the timestamps to the docs :return: doc """ for doc in copy.deepcopy(data): if withTimeStamp: if "timestamp" not in doc: sLog.warn("timestamp is not given") # if the timestamp is not provided, we use the current utc time. timestamp = doc.get("timestamp", int(TimeUtilities.toEpoch())) try: if isinstance(timestamp, datetime): doc["timestamp"] = int(timestamp.strftime("%s")) * 1000 elif isinstance(timestamp, str): timeobj = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f") doc["timestamp"] = int(timeobj.strftime("%s")) * 1000 else: # we assume the timestamp is an unix epoch time (integer). doc["timestamp"] = timestamp * 1000 except (TypeError, ValueError) as e: # in case we are not able to convert the timestamp to epoch time.... sLog.error("Wrong timestamp", e) doc["timestamp"] = int(TimeUtilities.toEpoch()) * 1000 sLog.debug("yielding %s" % doc) yield doc
def export_remove(self, typeName, startTime, endTime, valuesList): """ Remove a record for a type """ setup = self.serviceInfoDict["clientSetup"] startTime = int(TimeUtilities.toEpoch(startTime)) endTime = int(TimeUtilities.toEpoch(endTime)) return self.__acDB.deleteRecord(setup, typeName, startTime, endTime, valuesList) # pylint: disable=no-member
def export_commit(self, typeName, startTime, endTime, valuesList): """ Add a record for a type """ setup = self.serviceInfoDict["clientSetup"] startTime = int(TimeUtilities.toEpoch(startTime)) endTime = int(TimeUtilities.toEpoch(endTime)) return self.__acDB.insertRecordThroughQueue( # pylint: disable=no-member setup, typeName, startTime, endTime, valuesList)
def getUniqueValue(self, indexName, key, orderBy=False): """ :param str indexName: the name of the index which will be used for the query :param dict orderBy: it is a dictionary in case we want to order the result {key:'desc'} or {key:'asc'} :returns: a list of unique value for a certain key from the dictionary. """ query = self._Search(indexName) endDate = datetime.utcnow() startDate = endDate - timedelta(days=30) timeFilter = self._Q( "range", timestamp={ "lte": int(TimeUtilities.toEpoch(endDate)) * 1000, "gte": int(TimeUtilities.toEpoch(startDate)) * 1000, }, ) query = query.filter("bool", must=timeFilter) if orderBy: query.aggs.bucket(key, "terms", field=key, size=self.RESULT_SIZE, order=orderBy).metric(key, "cardinality", field=key) else: query.aggs.bucket(key, "terms", field=key, size=self.RESULT_SIZE).metric(key, "cardinality", field=key) try: query = query.extra( size=self.RESULT_SIZE) # do not need the raw data. sLog.debug("Query", query.to_dict()) result = query.execute() except TransportError as e: return S_ERROR(e) values = [] for bucket in result.aggregations[key].buckets: values += [bucket["key"]] del query sLog.debug("Nb of unique rows retrieved", len(values)) return S_OK(values)
def main(): params = Params() params.registerCLISwitches() Script.parseCommandLine(ignoreErrors=True) result = gProxyManager.getDBContents() if not result["OK"]: print("Can't retrieve list of users: %s" % result["Message"]) DIRAC.exit(1) keys = result["Value"]["ParameterNames"] records = result["Value"]["Records"] dataDict = {} now = datetime.datetime.utcnow() for record in records: expirationDate = record[3] dt = expirationDate - now secsLeft = dt.days * 86400 + dt.seconds if secsLeft > params.proxyLifeTime: userName, userDN, userGroup, _, persistent = record if userName not in dataDict: dataDict[userName] = [] dataDict[userName].append((userDN, userGroup, expirationDate, persistent)) for userName in dataDict: print("* %s" % userName) for iP in range(len(dataDict[userName])): data = dataDict[userName][iP] print(" DN : %s" % data[0]) print(" group : %s" % data[1]) print(" not after : %s" % TimeUtilities.toString(data[2])) print(" persistent : %s" % data[3]) if iP < len(dataDict[userName]) - 1: print(" -") DIRAC.exit(0)
def __reportActivity(self): initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring() pendingQueries = self._threadPool._work_queue.qsize() activeQuereies = len(self._threadPool._threads) percentage = self.__endReportToMonitoring(initialWallTime, initialCPUTime) self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "MemoryUsage": mem, "CpuPercentage": percentage, "PendingQueries": pendingQueries, "ActiveQueries": activeQuereies, "RunningThreads": threading.activeCount(), "MaxFD": self.__maxFD, }) self.__maxFD = 0
def _mbReceivedMsg(self, trid, msgObj): result = self._authorizeProposal( ("Message", msgObj.getName()), trid, self._transportPool.get(trid).getConnectingCredentials()) if not result["OK"]: return result result = self._instantiateHandler(trid) if not result["OK"]: return result handlerObj = result["Value"] response = handlerObj._rh_executeMessageCallback(msgObj) if self.activityMonitoring and response["OK"]: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) if response["OK"]: return response["Value"][0] else: return response
def filterOngoing(selectOutput): """ Selects all the ongoing downtimes """ downtimes = selectOutput downtimesFiltered = [] currentDate = TimeUtilities.toEpoch(datetime.datetime.utcnow()) for dt in downtimes: dtStart = TimeUtilities.toEpoch(dt["startDate"]) dtEnd = TimeUtilities.toEpoch(dt["endDate"]) if (dtStart <= currentDate) and (dtEnd >= currentDate): downtimesFiltered.append(dt) return downtimesFiltered
def __reportToMonitoring(self, responseTime): """ Periodically reports to Monitoring """ # Calculate CPU usage by comparing realtime and cpu time since last report percentage = self.__endReportToMonitoringLoop(self.__report[0], self.__report[1]) # Send record to Monitoring self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "Tornado", "MemoryUsage": self.__report[2], "CpuPercentage": percentage, "ResponseTime": responseTime, }) self.activityMonitoringReporter.commit() # Save memory usage and save realtime/CPU time for next call self.__report = self.__startReportToMonitoringLoop()
def addLoggingRecord( self, jobID, status="idem", minorStatus="idem", applicationStatus="idem", date=None, source="Unknown", minor=None, application=None, ): """Add a new entry to the JobLoggingDB table. One, two or all the three status components (status, minorStatus, applicationStatus) can be specified. Optionally the time stamp of the status can be provided in a form of a string in a format '%Y-%m-%d %H:%M:%S' or as datetime.datetime object. If the time stamp is not provided the current UTC time is used. """ # Backward compatibility # FIXME: to remove in next version if minor: minorStatus = minor if application: applicationStatus = application event = "status/minor/app=%s/%s/%s" % (status, minorStatus, applicationStatus) self.log.info("Adding record for job ", str(jobID) + ": '" + event + "' from " + source) try: if not date: # Make the UTC datetime string and float _date = datetime.datetime.utcnow() elif isinstance(date, str): # The date is provided as a string in UTC _date = TimeUtilities.fromString(date) elif isinstance(date, datetime.datetime): _date = date else: self.log.error("Incorrect date for the logging record") _date = datetime.datetime.utcnow() except Exception: self.log.exception("Exception while date evaluation") _date = datetime.datetime.utcnow() epoc = time.mktime(_date.timetuple( )) + _date.microsecond / 1000000.0 - MAGIC_EPOC_NUMBER cmd = ( "INSERT INTO LoggingInfo (JobId, Status, MinorStatus, ApplicationStatus, " + "StatusTime, StatusTimeOrder, StatusSource) VALUES (%d,'%s','%s','%s','%s',%f,'%s')" % (int(jobID), status, minorStatus, applicationStatus[:255], str(_date), epoc, source[:32])) return self._update(cmd)
def getTypedValue(value, mtype): if mtype[0:3].lower() == "int": return int(value) elif mtype[0:5].lower() == "float": return float(value) elif mtype[0:4].lower() == "date": return TimeUtilities.fromString(value) else: return value
def __checkPlotRequest(self, reportRequest): """ It check the plot attributes. We have to make sure that all attributes which are needed are provided. :param dict reportRequest: contains the plot attributes. """ # If extraArgs is not there add it if "extraArgs" not in reportRequest: reportRequest["extraArgs"] = {} if not isinstance(reportRequest["extraArgs"], self.__reportRequestDict["extraArgs"]): return S_ERROR("Extra args has to be of type %s" % self.__reportRequestDict["extraArgs"]) reportRequestExtra = reportRequest["extraArgs"] # Check sliding plots if "lastSeconds" in reportRequestExtra: try: lastSeconds = int(reportRequestExtra["lastSeconds"]) except ValueError: gLogger.error("lastSeconds key must be a number") return S_ERROR("Value Error") if lastSeconds < 3600: return S_ERROR("lastSeconds must be more than 3600") now = datetime.datetime.utcnow() # this is an UTC time reportRequest["endTime"] = now reportRequest["startTime"] = now - datetime.timedelta( seconds=lastSeconds) else: # if end date is not there, just set it to now if not reportRequest.get("endTime"): # check the existence of the endTime it can be present and empty reportRequest["endTime"] = datetime.datetime.utcnow() # Check keys for key in self.__reportRequestDict: if key not in reportRequest: return S_ERROR("Missing mandatory field %s in plot request" % key) if not isinstance(reportRequest[key], self.__reportRequestDict[key]): return S_ERROR( "Type mismatch for field %s (%s), required one of %s" % (key, str(type(reportRequest[key])), str(self.__reportRequestDict[key]))) if key in ("startTime", "endTime"): reportRequest[key] = int( TimeUtilities.toEpoch(reportRequest[key])) return S_OK(reportRequest)
def export_commitRegisters(self, entriesList): """ Add a record for a type """ setup = self.serviceInfoDict["clientSetup"] expectedTypes = [str, datetime.datetime, datetime.datetime, list] for entry in entriesList: if len(entry) != 4: return S_ERROR("Invalid records") for i, _ in enumerate(entry): if not isinstance(entry[i], expectedTypes[i]): self.log.error( "Unexpected type in report", ": field %d in the records should be %s (and it is %s)" % (i, expectedTypes[i], type(entry[i])), ) return S_ERROR("Unexpected type in report") records = [] for entry in entriesList: startTime = int(TimeUtilities.toEpoch(entry[1])) endTime = int(TimeUtilities.toEpoch(entry[2])) self.log.debug("inserting", entry) records.append((setup, entry[0], startTime, endTime, entry[3])) return self.__acDB.insertRecordBundleThroughQueue(records)
def filterDate(selectOutput, start, end): """ Selects all the downtimes that meet the constraints of 'start' and 'end' dates """ downtimes = selectOutput downtimesFiltered = [] if start is not None: try: start = TimeUtilities.fromString(start) except Exception: error("datetime formt is incorrect, pls try [%Y-%m-%d[ %H:%M:%S]]") start = TimeUtilities.toEpoch(start) if end is not None: try: end = TimeUtilities.fromString(end) except Exception: error("datetime formt is incorrect, pls try [%Y-%m-%d[ %H:%M:%S]]") end = TimeUtilities.toEpoch(end) if start is not None and end is not None: for dt in downtimes: dtStart = TimeUtilities.toEpoch(dt["startDate"]) dtEnd = TimeUtilities.toEpoch(dt["endDate"]) if (dtStart >= start) and (dtEnd <= end): downtimesFiltered.append(dt) elif start is not None and end is None: for dt in downtimes: dtStart = TimeUtilities.toEpoch(dt["startDate"]) if dtStart >= start: downtimesFiltered.append(dt) elif start is None and end is not None: for dt in downtimes: dtEnd = TimeUtilities.toEpoch(dt["endDate"]) if dtEnd <= end: downtimesFiltered.append(dt) else: downtimesFiltered = downtimes return downtimesFiltered
def handleOldPilots(self, connection): """ select all pilots that have not been updated in the last N days and declared them Deleted, accounting for them. """ pilotsToAccount = {} timeLimitToConsider = TimeUtilities.toString( datetime.datetime.utcnow() - TimeUtilities.day * self.pilotStalledDays) result = self.pilotDB.selectPilots( {"Status": PilotStatus.PILOT_TRANSIENT_STATES}, older=timeLimitToConsider, timeStamp="LastUpdateTime") if not result["OK"]: self.log.error("Failed to get the Pilot Agents") return result if not result["Value"]: return S_OK() refList = result["Value"] result = self.pilotDB.getPilotInfo(refList) if not result["OK"]: self.log.error("Failed to get Info for Pilot Agents") return result pilotsDict = result["Value"] for pRef in pilotsDict: if pilotsDict[pRef].get("Jobs") and self._checkJobLastUpdateTime( pilotsDict[pRef]["Jobs"], self.pilotStalledDays): self.log.debug( "%s should not be deleted since one job of %s is running." % (str(pRef), str(pilotsDict[pRef]["Jobs"]))) continue deletedJobDict = pilotsDict[pRef] deletedJobDict["Status"] = PilotStatus.DELETED deletedJobDict["StatusDate"] = datetime.datetime.utcnow() pilotsToAccount[pRef] = deletedJobDict if len(pilotsToAccount) > 100: self.accountPilots(pilotsToAccount, connection) self._killPilots(pilotsToAccount) pilotsToAccount = {} self.accountPilots(pilotsToAccount, connection) self._killPilots(pilotsToAccount) return S_OK()
def __checkPlotRequest(self, reportRequest): # If extraArgs is not there add it if "extraArgs" not in reportRequest: reportRequest["extraArgs"] = {} if not isinstance(reportRequest["extraArgs"], self.__reportRequestDict["extraArgs"]): return S_ERROR("Extra args has to be of type %s" % self.__reportRequestDict["extraArgs"]) reportRequestExtra = reportRequest["extraArgs"] # Check sliding plots if "lastSeconds" in reportRequestExtra: try: lastSeconds = int(reportRequestExtra["lastSeconds"]) except ValueError: gLogger.error("lastSeconds key must be a number") return S_ERROR("Value Error") if lastSeconds < 3600: return S_ERROR("lastSeconds must be more than 3600") now = datetime.datetime.utcnow() reportRequest["endTime"] = now reportRequest["startTime"] = now - datetime.timedelta( seconds=lastSeconds) else: # if enddate is not there, just set it to now if not reportRequest.get("endTime", False): reportRequest["endTime"] = datetime.datetime.utcnow() # Check keys for key in self.__reportRequestDict: if key not in reportRequest: return S_ERROR("Missing mandatory field %s in plot reques" % key) if not isinstance(reportRequest[key], self.__reportRequestDict[key]): return S_ERROR( "Type mismatch for field %s (%s), required one of %s" % (key, str(type(reportRequest[key])), str(self.__reportRequestDict[key]))) if key in ("startTime", "endTime"): reportRequest[key] = int( TimeUtilities.toEpoch(reportRequest[key])) return S_OK(reportRequest)
def _checkJobLastUpdateTime(self, joblist, StalledDays): timeLimitToConsider = datetime.datetime.utcnow( ) - TimeUtilities.day * StalledDays ret = False for jobID in joblist: result = self.jobDB.getJobAttributes(int(jobID)) if result["OK"]: if "LastUpdateTime" in result["Value"]: lastUpdateTime = result["Value"]["LastUpdateTime"] if TimeUtilities.fromString( lastUpdateTime) > timeLimitToConsider: ret = True self.log.debug( "Since %s updates LastUpdateTime on %s this does not to need to be deleted." % (str(jobID), str(lastUpdateTime))) break else: self.log.error("Error taking job info from DB", result["Message"]) return ret
def createRMSRecord(self, status, nbObject): """ This method is used to create a record given some parameters for sending it to the ES backend. It is used inside DMS/Agent/RequestOperations and this method is designed particularly for file type of objects. :param status: This can be one of these i.e. Attempted, Failed, or Successful. :param nbObject: This is number of objects in question. :returns: a dictionary. """ record = { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "File", "operationType": self.operation.Type, "status": status, "nbObject": nbObject, "parentID": self.operation.OperationID, } return record
def _executeAction(self, trid, proposalTuple, handlerObj): try: response = handlerObj._rh_executeAction(proposalTuple) if not response["OK"]: return response if self.activityMonitoring: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) return response["Value"][0] except Exception as e: gLogger.exception("Exception while executing handler action") return S_ERROR("Server error while executing action: %s" % str(e))
def tabularPrint(table): columns_names = list(table[0]) records = [] for row in table: record = [] for _k, v in row.items(): if isinstance(v, datetime.datetime): record.append(TimeUtilities.toString(v)) elif v is None: record.append("") else: record.append(v) records.append(record) output = printTable(columns_names, records, numbering=False, columnSeparator=" | ", printOut=False) subLogger.notice(output)
def _getBucketLengthForTime(self, typeName, momentEpoch): nowEpoch = TimeUtilities.toEpoch() return self._acDB.calculateBucketLengthForTime(self._setup, typeName, nowEpoch, momentEpoch)
def execute(self): """Main execution method""" # PilotsHistory to Monitoring if "Monitoring" in self.pilotMonitoringOption: self.log.info("Committing PilotsHistory to Monitoring") result = PilotAgentsDB().getSummarySnapshot() now = datetime.datetime.utcnow() if not result["OK"]: self.log.error( "Can't get the PilotAgentsDB summary", "%s: won't commit PilotsHistory at this cycle" % result["Message"], ) values = result["Value"][1] for record in values: rD = {} for iP, _ in enumerate(self.__pilotsMapping): rD[self.__pilotsMapping[iP]] = record[iP] rD["timestamp"] = int(TimeUtilities.toEpoch(now)) self.pilotReporter.addRecord(rD) self.log.info("Committing to Monitoring...") result = self.pilotReporter.commit() if not result["OK"]: self.log.error("Could not commit to Monitoring", result["Message"]) self.log.verbose("Done committing PilotsHistory to Monitoring") # WMSHistory to Monitoring or Accounting self.log.info("Committing WMSHistory to %s backend" % "and ".join(self.jobMonitoringOption)) result = JobDB().getSummarySnapshot(self.__jobDBFields) now = datetime.datetime.utcnow() if not result["OK"]: self.log.error( "Can't get the JobDB summary", "%s: won't commit WMSHistory at this cycle" % result["Message"]) return S_ERROR() values = result["Value"][1] self.log.info("Start sending WMSHistory records") for record in values: record = record[1:] rD = {} for fV in self.__summaryDefinedFields: rD[fV[0]] = fV[1] for iP, _ in enumerate(self.__summaryKeyFieldsMapping): fieldName = self.__summaryKeyFieldsMapping[iP] rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP] record = record[len(self.__summaryKeyFieldsMapping):] for iP, _ in enumerate(self.__summaryValueFieldsMapping): rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP]) for backend in self.datastores: if backend.lower() == "monitoring": rD["timestamp"] = int(TimeUtilities.toEpoch(now)) self.datastores["Monitoring"].addRecord(rD) elif backend.lower() == "accounting": acWMS = WMSHistory() acWMS.setStartTime(now) acWMS.setEndTime(now) acWMS.setValuesFromDict(rD) retVal = acWMS.checkValues() if not retVal["OK"]: self.log.error("Invalid WMSHistory accounting record ", "%s -> %s" % (retVal["Message"], rD)) else: self.datastores["Accounting"].addRegister(acWMS) for backend, datastore in self.datastores.items(): self.log.info("Committing WMSHistory records to %s backend" % backend) result = datastore.commit() if not result["OK"]: self.log.error("Couldn't commit WMSHistory to %s" % backend, result["Message"]) return S_ERROR() self.log.verbose("Done committing WMSHistory to %s backend" % backend) return S_OK()
def am_go(self): # Set the shifter proxy if required result = self._setShifterProxy() if not result["OK"]: return result self.log.notice("-" * 40) self.log.notice("Starting cycle for module %s" % self.__moduleProperties["fullName"]) mD = self.am_getMaxCycles() if mD > 0: cD = self.__moduleProperties["cyclesDone"] self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD)) self.log.notice("-" * 40) # use SIGALARM as a watchdog interrupt if enabled watchdogInt = self.am_getWatchdogTime() if watchdogInt > 0: signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.alarm(watchdogInt) elapsedTime = time.time() if self.activityMonitoring: initialWallTime, initialCPUTime, mem = self._startReportToMonitoring( ) cycleResult = self.__executeModuleCycle() if self.activityMonitoring and initialWallTime and initialCPUTime: cpuPercentage = self._endReportToMonitoring( initialWallTime, initialCPUTime) # Increment counters self.__moduleProperties["cyclesDone"] += 1 # Show status elapsedTime = time.time() - elapsedTime self.__moduleProperties["totalElapsedTime"] += elapsedTime self.log.notice("-" * 40) self.log.notice("Agent module %s run summary" % self.__moduleProperties["fullName"]) self.log.notice(" Executed %s times previously" % self.__moduleProperties["cyclesDone"]) self.log.notice(" Cycle took %.2f seconds" % elapsedTime) averageElapsedTime = self.__moduleProperties[ "totalElapsedTime"] / self.__moduleProperties["cyclesDone"] self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime)) elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption( "PollingTime") self.log.notice(" Polling time: %s seconds" % self.am_getOption("PollingTime")) self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate) if cycleResult["OK"]: self.log.notice(" Cycle was successful") if self.activityMonitoring: # Here we record the data about the cycle duration along with some basic details about the # agent and right now it isn't committed to the ES backend. self.activityMonitoringReporter.addRecord({ "AgentName": self.agentName, "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "MemoryUsage": mem, "CpuPercentage": cpuPercentage, "CycleDuration": elapsedTime, }) else: self.log.warn(" Cycle had an error:", cycleResult["Message"]) self.log.notice("-" * 40) # Update number of cycles # cycle finished successfully, cancel watchdog if watchdogInt > 0: signal.alarm(0) return cycleResult
def _setJobStatusBulk(cls, jobID, statusDict, force=False): """Set various status fields for job specified by its jobId. Set only the last status in the JobDB, updating all the status logging information in the JobLoggingDB. The statusDict has datetime as a key and status information dictionary as values """ jobID = int(jobID) log = cls.log.getLocalSubLogger("JobStatusBulk/Job-%d" % jobID) result = cls.jobDB.getJobAttributes(jobID, ["Status", "StartExecTime", "EndExecTime"]) if not result["OK"]: return result if not result["Value"]: # if there is no matching Job it returns an empty dictionary return S_ERROR("No Matching Job") # If the current status is Stalled and we get an update, it should probably be "Running" currentStatus = result["Value"]["Status"] if currentStatus == JobStatus.STALLED: currentStatus = JobStatus.RUNNING startTime = result["Value"].get("StartExecTime") endTime = result["Value"].get("EndExecTime") # getJobAttributes only returns strings :( if startTime == "None": startTime = None if endTime == "None": endTime = None # Remove useless items in order to make it simpler later, although there should not be any for sDict in statusDict.values(): for item in sorted(sDict): if not sDict[item]: sDict.pop(item, None) # Get the latest time stamps of major status updates result = cls.jobLoggingDB.getWMSTimeStamps(int(jobID)) if not result["OK"]: return result if not result["Value"]: return S_ERROR("No registered WMS timeStamps") # This is more precise than "LastTime". timeStamps is a sorted list of tuples... timeStamps = sorted((float(t), s) for s, t in result["Value"].items() if s != "LastTime") lastTime = TimeUtilities.toString(TimeUtilities.fromEpoch(timeStamps[-1][0])) # Get chronological order of new updates updateTimes = sorted(statusDict) log.debug("*** New call ***", "Last update time %s - Sorted new times %s" % (lastTime, updateTimes)) # Get the status (if any) at the time of the first update newStat = "" firstUpdate = TimeUtilities.toEpoch(TimeUtilities.fromString(updateTimes[0])) for ts, st in timeStamps: if firstUpdate >= ts: newStat = st # Pick up start and end times from all updates for updTime in updateTimes: sDict = statusDict[updTime] newStat = sDict.get("Status", newStat) if not startTime and newStat == JobStatus.RUNNING: # Pick up the start date when the job starts running if not existing startTime = updTime log.debug("Set job start time", startTime) elif not endTime and newStat in JobStatus.JOB_FINAL_STATES: # Pick up the end time when the job is in a final status endTime = updTime log.debug("Set job end time", endTime) # We should only update the status to the last one if its time stamp is more recent than the last update attrNames = [] attrValues = [] if updateTimes[-1] >= lastTime: minor = "" application = "" # Get the last status values looping on the most recent upupdateTimes in chronological order for updTime in [dt for dt in updateTimes if dt >= lastTime]: sDict = statusDict[updTime] log.debug("\t", "Time %s - Statuses %s" % (updTime, str(sDict))) status = sDict.get("Status", currentStatus) # evaluate the state machine if the status is changing if not force and status != currentStatus: res = JobStatus.JobsStateMachine(currentStatus).getNextState(status) if not res["OK"]: return res newStat = res["Value"] # If the JobsStateMachine does not accept the candidate, don't update if newStat != status: # keeping the same status log.error( "Job Status Error", "%s can't move from %s to %s: using %s" % (jobID, currentStatus, status, newStat), ) status = newStat sDict["Status"] = newStat # Change the source to indicate this is not what was requested source = sDict.get("Source", "") sDict["Source"] = source + "(SM)" # at this stage status == newStat. Set currentStatus to this new status currentStatus = newStat minor = sDict.get("MinorStatus", minor) application = sDict.get("ApplicationStatus", application) log.debug("Final statuses:", "status '%s', minor '%s', application '%s'" % (status, minor, application)) if status: attrNames.append("Status") attrValues.append(status) if minor: attrNames.append("MinorStatus") attrValues.append(minor) if application: attrNames.append("ApplicationStatus") attrValues.append(application) # Here we are forcing the update as it's always updating to the last status result = cls.jobDB.setJobAttributes(jobID, attrNames, attrValues, update=True, force=True) if not result["OK"]: return result # Update start and end time if needed if endTime: result = cls.jobDB.setEndExecTime(jobID, endTime) if not result["OK"]: return result if startTime: result = cls.jobDB.setStartExecTime(jobID, startTime) if not result["OK"]: return result # Update the JobLoggingDB records heartBeatTime = None for updTime in updateTimes: sDict = statusDict[updTime] status = sDict.get("Status", "idem") minor = sDict.get("MinorStatus", "idem") application = sDict.get("ApplicationStatus", "idem") source = sDict.get("Source", "Unknown") result = cls.jobLoggingDB.addLoggingRecord( jobID, status=status, minorStatus=minor, applicationStatus=application, date=updTime, source=source ) if not result["OK"]: return result # If the update comes from a job, update the heart beat time stamp with this item's stamp if source.startswith("Job"): heartBeatTime = updTime if heartBeatTime is not None: result = cls.jobDB.setHeartBeatData(jobID, {"HeartBeatTime": heartBeatTime}) if not result["OK"]: return result return S_OK((attrNames, attrValues))
def execute(self): """read requests from RequestClient and enqueue them into ProcessPool""" # # requests (and so tasks) counter taskCounter = 0 while taskCounter < self.__requestsPerCycle: self.log.debug("execute: executing %d request in this cycle" % taskCounter) requestsToExecute = [] if not self.__bulkRequest: self.log.info("execute: ask for a single request") getRequest = self.requestClient().getRequest() if not getRequest["OK"]: self.log.error("execute:", "%s" % getRequest["Message"]) break if not getRequest["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break requestsToExecute = [getRequest["Value"]] else: numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter) self.log.info("execute: ask for requests", "%s" % numberOfRequest) getRequests = self.requestClient().getBulkRequests(numberOfRequest) if not getRequests["OK"]: self.log.error("execute:", "%s" % getRequests["Message"]) break if not getRequests["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break for rId in getRequests["Value"]["Failed"]: self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId]) requestsToExecute = list(getRequests["Value"]["Successful"].values()) self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute)) for request in requestsToExecute: # # set task id taskID = request.RequestID self.log.info( "processPool status", "tasks idle = %s working = %s" % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()), ) looping = 0 while True: if not self.processPool().getFreeSlots(): if not looping: self.log.info( "No free slots available in processPool", "will wait %d seconds to proceed" % self.__poolSleep, ) time.sleep(self.__poolSleep) looping += 1 else: if looping: self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep) looping = 0 # # save current request in cache res = self.cacheRequest(request) if not res["OK"]: if cmpError(res, errno.EALREADY): # The request is already in the cache, skip it. break out of the while loop to get next request break # There are too many requests in the cache, commit suicide self.log.error( "Too many requests in cache", "(%d requests): put back all requests and exit cycle. Error %s" % (len(self.__requestCache), res["Message"]), ) self.putAllRequests() return res # # serialize to JSON result = request.toJSON() if not result["OK"]: continue requestJSON = result["Value"] self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName)) timeOut = self.getTimeout(request) enqueue = self.processPool().createAndQueueTask( RequestTask, kwargs={ "requestJSON": requestJSON, "handlersDict": self.handlersDict, "csPath": self.__configPath, "agentName": self.agentName, "rmsMonitoring": self.__rmsMonitoring, }, taskID=taskID, blocking=True, usePoolCallbacks=True, timeOut=timeOut, ) if not enqueue["OK"]: self.log.error("Could not enqueue task", enqueue["Message"]) else: self.log.debug("successfully enqueued task", "'%s'" % taskID) # # update monitor if self.__rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "status": "Attempted", "objectID": request.RequestID, "nbObject": 1, } ) # # update request counter taskCounter += 1 # # task created, a little time kick to proceed time.sleep(0.1) break self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache)) processed = self.processPool().processResults() # This happens when the result queue is screwed up. # Returning S_ERROR proved not to be sufficient, # and when in this situation, there is nothing we can do. # So we just exit. runit will restart from scratch. if processed < 0: self.log.fatal("Results queue is screwed up") sys.exit(1) # # clean return return S_OK()
def __call__(self): """request processing""" self.log.debug("about to execute request") # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: userSuspended = "User is currently suspended" self.request.Error = setupProxy["Message"] # In case the user does not have proxy if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND): self.log.error("Error setting proxy. Request set to Failed:", setupProxy["Message"]) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = "Failed" operation.Status = "Failed" elif userSuspended in setupProxy["Message"]: # If user is suspended, wait for a long time self.request.delayNextExecution(6 * 60) self.request.Error = userSuspended self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN) else: self.log.error("Error setting proxy", setupProxy["Message"]) return S_OK(self.request) shifter = setupProxy["Value"]["Shifter"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error("Cannot get waiting operation", operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation", "%s" % operation.Type) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"])) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # set rmsMonitoring flag for the RequestOperation handler.rmsMonitoring = self.rmsMonitoring # # and execute pluginName = self.getPluginName(self.handlersDict.get(operation.Type)) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Attempted", "nbObject": 1, } ) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "false") exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true") if not exe["OK"]: self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"])) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, } ) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, } ) if self.request.JobID: # Check if the job exists monitorServer = JobMonitoringClient(useCertificates=True) res = monitorServer.getJobSummary(int(self.request.JobID)) if not res["OK"]: self.log.error("RequestTask: Failed to get job status", "%d" % self.request.JobID) elif not res["Value"]: self.log.warn( "RequestTask: job does not exist (anymore): failed request", "JobID: %d" % self.request.JobID, ) for opFile in operation: opFile.Status = "Failed" if operation.Status != "Failed": operation.Status = "Failed" self.request.Error = "Job no longer exists" except Exception as error: self.log.exception("hit by exception:", error) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, } ) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, } ) if useServerCertificate: gConfigurationData.setOptionInCFG("/DIRAC/Security/UseServerCertificate", "true") break # # operation status check if operation.Status == "Done" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Successful", "nbObject": 1, } ) elif operation.Status == "Failed" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, } ) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break if error: return S_ERROR(error) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info("Updating request status:", "%s" % self.request.Status) update = self.updateRequest() if not update["OK"]: self.log.error("Cannot update request status", update["Message"]) return update self.log.info("request is done", "%s" % self.request.RequestName) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": getattr(self.request, "RequestID", 0), "status": "Successful", "nbObject": 1, } ) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestID, self.request.JobID # pylint: disable=no-member ) if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request, will retry", "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]), ) self.log.debug("Waiting 10 seconds") attempts += 1 if attempts == 10: self.log.error("Giving up finalize request") return S_ERROR("Could not finalize request") time.sleep(10) else: self.log.info( "request is finalized", "ReqName %s %s" % (self.request.RequestName, (" after %d attempts" % attempts) if attempts else ""), ) break # Commit all the data to the ES Backend if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # Request will be updated by the callBack method self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status) return S_OK(self.request)
def export_getJobPageSummaryWeb(self, selectDict, sortList, startItem, maxItems, selectJobs=True): """Get the summary of the job information for a given page in the job monitor in a generic format """ resultDict = {} startDate, endDate, selectDict = self.parseSelectors(selectDict) # initialize jobPolicy credDict = self.getRemoteCredentials() ownerDN = credDict["DN"] ownerGroup = credDict["group"] operations = Operations(group=ownerGroup) globalJobsInfo = operations.getValue( "/Services/JobMonitoring/GlobalJobsInfo", True) jobPolicy = JobPolicy(ownerDN, ownerGroup, globalJobsInfo) jobPolicy.jobDB = self.jobDB result = jobPolicy.getControlledUsers(RIGHT_GET_INFO) if not result["OK"]: return result if not result["Value"]: return S_ERROR( "User and group combination has no job rights (%r, %r)" % (ownerDN, ownerGroup)) if result["Value"] != "ALL": selectDict[("Owner", "OwnerGroup")] = result["Value"] # Sorting instructions. Only one for the moment. if sortList: orderAttribute = sortList[0][0] + ":" + sortList[0][1] else: orderAttribute = None result = self.jobDB.getCounters("Jobs", ["Status"], selectDict, newer=startDate, older=endDate, timeStamp="LastUpdateTime") if not result["OK"]: return result statusDict = {} nJobs = 0 for stDict, count in result["Value"]: nJobs += count statusDict[stDict["Status"]] = count resultDict["TotalRecords"] = nJobs if nJobs == 0: return S_OK(resultDict) resultDict["Extras"] = statusDict if selectJobs: iniJob = startItem if iniJob >= nJobs: return S_ERROR("Item number out of range") result = self.jobDB.selectJobs(selectDict, orderAttribute=orderAttribute, newer=startDate, older=endDate, limit=(maxItems, iniJob)) if not result["OK"]: return result summaryJobList = result["Value"] if not globalJobsInfo: validJobs, _invalidJobs, _nonauthJobs, _ownJobs = jobPolicy.evaluateJobRights( summaryJobList, RIGHT_GET_INFO) summaryJobList = validJobs result = self.getJobsAttributes(summaryJobList, SUMMARY) if not result["OK"]: return result summaryDict = result["Value"] # If no jobs can be selected after the properties check if not summaryDict: return S_OK(resultDict) # Evaluate last sign of life time for jobDict in summaryDict.values(): if not jobDict.get( "HeartBeatTime") or jobDict["HeartBeatTime"] == "None": jobDict["LastSignOfLife"] = jobDict["LastUpdateTime"] elif False: # Code kept in case this is not working, but if we update the HeartBeatTime # at each status change from the jobs it should not be needed # Items are always strings lastTime = TimeUtilities.fromString( jobDict["LastUpdateTime"]) hbTime = TimeUtilities.fromString(jobDict["HeartBeatTime"]) # Try and identify statuses not set by the job itself as too expensive to get logging info # Not only Stalled jobs but also Failed jobs because Stalled if (hbTime > lastTime or jobDict["Status"] == JobStatus.STALLED or jobDict["MinorStatus"] in ( JobMinorStatus.REQUESTS_DONE, JobMinorStatus.STALLED_PILOT_NOT_RUNNING, ) or jobDict["MinorStatus"].startswith("Stalling")): jobDict["LastSignOfLife"] = jobDict["HeartBeatTime"] else: jobDict["LastSignOfLife"] = jobDict["LastUpdateTime"] else: jobDict["LastSignOfLife"] = jobDict["HeartBeatTime"] # prepare the standard structure now # This should be faster than making a list of values() for jobDict in summaryDict.values(): paramNames = list(jobDict) break records = [ list(jobDict.values()) for jobDict in summaryDict.values() ] resultDict["ParameterNames"] = paramNames resultDict["Records"] = records return S_OK(resultDict)