def __discoverURL(self):
    """
    Calculate the final URL. Called at initialization and in connect in case of issue.

    On success it sets:
      * self.serviceURL: the url selected as target by __findServiceURL
      * self.__URLTuple: the split of serviceURL returned by Network.splitURL
      * self._serviceName: the last element of __URLTuple

    Options found under /DIRAC/ConnConf/<tuple[1]>:<tuple[2]> are copied into
    self.kwargs, without overriding keys the caller already supplied.

    :return: S_OK() on success, otherwise the failing result structure
             (or S_ERROR with the repr of an exception raised by __findServiceURL)
    """
    try:
        lookup = self.__findServiceURL()
    except Exception as exc:
        return S_ERROR(repr(exc))
    if not lookup['OK']:
        return lookup
    self.serviceURL = lookup['Value']

    split = Network.splitURL(self.serviceURL)
    if not split['OK']:
        return split
    self.__URLTuple = split['Value']
    self._serviceName = self.__URLTuple[-1]

    # Per-destination connection options act only as defaults.
    connConf = gConfig.getOptionsDict("/DIRAC/ConnConf/%s:%s" % self.__URLTuple[1:3])
    if connConf['OK']:
        options = connConf['Value']
        for option in options:
            self.kwargs.setdefault(option, options[option])
    return S_OK()
def __reduceComponentList(self, componentList):
    """
    Only keep the most restrictive components.

    A component is discarded when another component that has not itself been
    discarded contains every one of its keys with a matching value. 'Host'
    values are compared with Network.checkHostsMatch, all other values with
    plain equality.

    The input list is left untouched; discarded entries are tracked by index
    instead of being overwritten with False in the caller's list.

    :param componentList: list of component description dictionaries
    :return: new list with the surviving components, in their original order
    """
    dropped = set()
    for i, component in enumerate(componentList):
        for j, candidate in enumerate(componentList):
            if i == j or j in dropped:
                continue
            for key in component:
                if key not in candidate:
                    break
                if key == 'Host':
                    result = Network.checkHostsMatch(component[key], candidate[key])
                    if not result['OK'] or not result['Value']:
                        break
                elif component[key] != candidate[key]:
                    break
            else:
                # Every key of `component` is matched by `candidate`:
                # `component` is the less restrictive of the two.
                dropped.add(i)
                break
    return [comp for idx, comp in enumerate(componentList) if idx not in dropped]
def getSocket(self, hostAddress, **kwargs):
    """
    Open a client connection to hostAddress and perform the client handshake.

    The host name is resolved to a list of IPs (passed through List.randomize);
    each IP is tried in turn until __connect succeeds. The connect + handshake
    sequence is attempted up to three times while the handshake keeps failing.

    :param hostAddress: sequence whose first item is the host name and second the port
    :param kwargs: forwarded to generateClientInfo; 'enableSessions' additionally
                   stores the session in gSessionManager
    :return: S_OK(socketInfo) or a failing result structure
    """
    hostName = hostAddress[0]
    clientInfo = self.generateClientInfo(hostName, kwargs)
    if not clientInfo['OK']:
        return clientInfo
    socketInfo = clientInfo['Value']

    resolved = Network.getIPsForHostName(hostName)
    if not resolved['OK']:
        return S_ERROR("Could not resolve %s: %s" % (hostName, resolved['Message']))
    ipList = List.randomize(resolved['Value'])

    for _attempt in range(3):
        errorsList = []
        for ip in ipList:
            ipAddress = (ip, hostAddress[1])
            connection = self.__connect(socketInfo, ipAddress)
            if connection['OK']:
                sslSocket = connection['Value']
                break
            errorsList.append("%s: %s" % (ipAddress, connection['Message']))
        else:
            # No IP accepted the connection on this attempt.
            return S_ERROR("Could not connect to %s: %s" % (hostAddress, ",".join(errorsList)))
        handshake = socketInfo.doClientHandshake()
        if handshake['OK']:
            # Everything went ok. Don't need to retry
            break

    # Did the auth or the connection fail?
    if not handshake['OK']:
        return handshake
    if kwargs.get('enableSessions'):
        sessionId = hash(hostAddress)
        gSessionManager.set(sessionId, sslSocket.get_session())
    return S_OK(socketInfo)
def initialize(self):
    """
    Prepare the monitoring client.

    Creates the "Monitoring" sub-logger, records setup and site in
    self.sourceDict, derives self.cfgSection (and, for agents and web
    components, the component location/name) from the component type,
    registers the client with gMonitoringFlusher and marks it initialized.

    :raises Exception: when the component type is none of the known ones
    """
    self.logger = gLogger.getSubLogger("Monitoring")
    self.logger.debug("Initializing Monitoring Client")
    self.sourceDict['setup'] = gConfig.getValue("/DIRAC/Setup")
    self.sourceDict['site'] = DIRAC.siteName()

    componentType = self.sourceDict['componentType']
    if componentType == self.COMPONENT_SERVICE:
        self.cfgSection = PathFinder.getSystemSection(self.sourceDict['componentName'])
    elif componentType == self.COMPONENT_AGENT:
        self.cfgSection = PathFinder.getAgentSection(self.sourceDict['componentName'])
        self.setComponentLocation(Network.getFQDN())
    elif componentType == self.COMPONENT_WEB:
        self.cfgSection = "/WebApp"
        self.setComponentLocation('http://%s' % Network.getFQDN())
        self.setComponentName('WebApp')
    elif componentType == self.COMPONENT_SCRIPT:
        self.cfgSection = "/Script"
    else:
        raise Exception("Component type has not been defined")

    gMonitoringFlusher.registerMonitoringClient(self)
    # ExitCallback.registerExitCallback( self.forceFlush )
    self.__initialized = True
def initialize(self):
    """
    Initialize the agent: read local configuration, discover the IP address,
    read the stop policy and declare the instance as running.

    :return: S_OK() on success; S_ERROR("Halting!") after calling __haltInstance
             when the instance cannot be declared running; or the failing result
             of __getCSConfig
    """
    self.am_disableMonitoring()
    # Init vars
    self.runningPod = gConfig.getValue('/LocalSite/RunningPod')
    self.log.info("Running pod name of the image is %s" % self.runningPod)
    self.vmID = gConfig.getValue('/LocalSite/VMID')
    self.__loadHistory = []
    self.vmMinWorkingLoad = None
    self.vmLoadAvgTimespan = None
    self.vmJobWrappersLocation = None
    self.haltPeriod = None
    self.haltBeforeMargin = None
    self.heartBeatPeriod = None
    self.am_setOption("MaxCycles", 0)
    self.am_setOption("PollingTime", 60)

    # Discover net address: first interface (in sorted name order) whose name
    # starts with "eth". Default to None so the attribute always exists; without
    # it the log line below raised AttributeError on hosts with no eth* interface.
    self.ipAddress = None
    netData = Network.discoverInterfaces()
    for iface in sorted(netData):
        if iface.startswith("eth"):
            self.ipAddress = netData[iface]['ip']
            break
    self.log.info("IP Address is %s" % self.ipAddress)

    # Getting the stop policy
    self.op = Operations.Operations()
    # NOTE(review): the "%s" in this option path is never interpolated, so the
    # literal path "Cloud/%s/VMStopPolicy" is looked up. It presumably should be
    # filled in (running pod name?) -- confirm before changing.
    self.vmStopPolicy = self.op.getValue("Cloud/%s/VMStopPolicy", 'elastic')
    self.log.info("vmStopPolicy is %s" % self.vmStopPolicy)

    # Declare instance running
    self.uniqueID = ''
    result = virtualMachineDB.getUniqueIDByName(self.vmID)
    if result['OK']:
        self.uniqueID = result['Value']
    result = self.__declareInstanceRunning()
    if not result['OK']:
        self.log.error("Could not declare instance running", result['Message'])
        self.__haltInstance()
        return S_ERROR("Halting!")
    self.__instanceInfo = result['Value']

    # Get the cs config
    result = self.__getCSConfig()
    if not result['OK']:
        return result
    return S_OK()
def initialize(self):
    """
    Prepare the monitoring client.

    Creates the "Monitoring" sub-logger, stores setup and site in
    self.sourceDict, chooses self.cfgSection according to the component type
    (also setting location/name for agent and web components), registers the
    client with gMonitoringFlusher and flags it as initialized.

    :raises Exception: when the component type is none of the known ones
    """
    self.logger = gLogger.getSubLogger("Monitoring")
    self.logger.debug("Initializing Monitoring Client")
    self.sourceDict['setup'] = gConfig.getValue("/DIRAC/Setup")
    self.sourceDict['site'] = DIRAC.siteName()

    kind = self.sourceDict['componentType']
    if kind == self.COMPONENT_SERVICE:
        self.cfgSection = PathFinder.getSystemSection(self.sourceDict['componentName'])
    elif kind == self.COMPONENT_AGENT:
        self.cfgSection = PathFinder.getAgentSection(self.sourceDict['componentName'])
        self.setComponentLocation(Network.getFQDN())
    elif kind == self.COMPONENT_WEB:
        self.cfgSection = "/Website"
        self.setComponentLocation('http://%s' % Network.getFQDN())
        self.setComponentName('Web')
    elif kind == self.COMPONENT_SCRIPT:
        self.cfgSection = "/Script"
    else:
        raise Exception("Component type has not been defined")

    gMonitoringFlusher.registerMonitoringClient(self)
    # ExitCallback.registerExitCallback( self.forceFlush )
    self.__initialized = True
def __addFoundDefinedComponent(self, compDictList):
    """
    Merge the components found in the DB set with the ones defined in compDictList.

    Each DB component located via walkSet is appended to the required-set entry.
    A DB component without a 'Status' gets 'OK' and is then checked for port
    consistency (services only) and heartbeat age. Defined components that no
    DB component matches perfectly are appended with an 'Error' status.

    :param compDictList: non-empty list of component definition dictionaries
    """
    reference = compDictList[0]
    cD = self.walkSet(self.__requiredSet, reference)
    dbD = self.walkSet(self.__dbSet, reference)
    now = Time.dateTime()
    pending = compDictList
    for dbComp in dbD:
        if "Status" not in dbComp:
            self.__setStatus(dbComp, "OK")
            if dbComp["Type"] == "service":
                if "Port" not in dbComp:
                    self.__setStatus(dbComp, "Error", "Port is not defined")
                else:
                    definedPorts = [entry["Port"] for entry in compDictList if "Port" in entry]
                    if dbComp["Port"] not in definedPorts:
                        self.__setStatus(
                            compDictList[-1],
                            "Error",
                            "Port (%s) is different that specified in the CS" % dbComp["Port"],
                        )
            sinceHeartbeat = now - dbComp["LastHeartbeat"]
            secsAgo = sinceHeartbeat.days * 86400 + sinceHeartbeat.seconds
            if secsAgo > self.__maxSecsSinceHeartbeat:
                self.__setStatus(
                    dbComp,
                    "Error",
                    "Last heartbeat was received at %s (%s secs ago)" % (dbComp["LastHeartbeat"], secsAgo),
                )
        cD.append(dbComp)

        # Keep only the defined components this DB component does not match perfectly.
        stillPending = []
        for wanted in pending:
            perfectMatch = True
            for field in wanted:
                if field in ("Status", "Message"):
                    continue
                if field not in dbComp:
                    perfectMatch = False
                elif field == "Host":
                    result = Network.checkHostsMatch(wanted[field], dbComp[field])
                    if not (result["OK"] and result["Value"]):
                        perfectMatch = False
                elif wanted[field] != dbComp[field]:
                    perfectMatch = False
            if not perfectMatch:
                stillPending.append(wanted)
        pending = stillPending

    for leftover in pending:
        self.__setStatus(leftover, "Error", "There is no component up with this properties")
        cD.append(leftover)
def __init__(self, optionsDictionary):
    """
    Build the thread, store its settings and start it immediately as a daemon.

    :param optionsDictionary: mapping providing the 'Interactive', 'SleepTime'
                              and 'Site' entries
    """
    threading.Thread.__init__(self)

    # Settings supplied by the caller
    self.__interactive = optionsDictionary['Interactive']
    self.__sleep = optionsDictionary['SleepTime']

    # Internal state
    self._messageQueue = Queue.Queue()
    self._Transactions = []
    self._alive = True
    self._site = optionsDictionary['Site']
    self._hostname = Network.getFQDN()

    # Level boundaries, read from the LogLevels table
    levels = LogLevels()
    self._logLevels = levels
    self._negativeLevel = levels.getLevelValue('ERROR')
    self._positiveLevel = levels.getLevelValue('ALWAYS')
    self._maxBundledMessages = 20

    self.setDaemon(1)
    self.start()
def initialize(self):
    """
    Prepare the monitoring client.

    Creates the "Monitoring" sub-logger, stores setup and site in
    self.sourceDict, selects self.cfgSection from the component type (also
    setting location/name for agent and web components), registers the client
    with gMonitoringFlusher and flags it as initialized.

    :raises Exception: when the component type is none of the known ones
    """
    self.logger = gLogger.getSubLogger("Monitoring")
    self.logger.debug("Initializing Monitoring Client")
    self.sourceDict["setup"] = gConfig.getValue("/DIRAC/Setup")
    self.sourceDict["site"] = DIRAC.siteName()

    kind = self.sourceDict["componentType"]
    if kind == self.COMPONENT_SERVICE:
        self.cfgSection = PathFinder.getSystemSection(self.sourceDict["componentName"])
    elif kind == self.COMPONENT_AGENT:
        self.cfgSection = PathFinder.getAgentSection(self.sourceDict["componentName"])
        self.setComponentLocation(Network.getFQDN())
    elif kind == self.COMPONENT_WEB:
        self.cfgSection = "/WebApp"
        self.setComponentLocation("http://%s" % Network.getFQDN())
        self.setComponentName("WebApp")
    elif kind == self.COMPONENT_SCRIPT:
        self.cfgSection = "/Script"
    elif kind == self.COMPONENT_TORNADO:
        self.cfgSection = "/Tornado"
    else:
        raise Exception("Component type has not been defined")

    gMonitoringFlusher.registerMonitoringClient(self)
    self.__initialized = True
def __selectUrl(self, notselect, urls):
    """
    Pick a URL whose host differs from the one in `notselect`.

    Used when multiple services are running in the same host and a new url has
    to be in a different host.

    :param notselect: split URL that should NOT be selected; item 1 is compared
                      with item 1 of each split candidate
    :param urls: iterable of candidate URLs
    :return: the first candidate on a different host, or None when there is none
             (the fallback to the already selected url is left to the caller)
    """
    for candidate in urls:
        parsed = Network.splitURL(candidate)
        if not parsed['OK']:
            gLogger.error(parsed['Message'])
            continue
        if parsed['Value'][1] != notselect[1]:
            # the hosts are different
            return candidate
    return None
def __selectUrl(self, notselect, urls):
    """
    Pick a URL whose host differs from the one in `notselect`.

    Used when multiple services are running in the same host and a new url has
    to be in a different host.

    :param notselect: split URL that should NOT be selected; item 1 is compared
                      with item 1 of each split candidate
    :param urls: iterable of candidate URLs
    :return: the first candidate on a different host, or None when there is none
             (the fallback to the already selected url is left to the caller)
    """
    for candidate in urls:
        parsed = Network.splitURL(candidate)
        if not parsed['OK']:
            gLogger.error(parsed['Message'])
            continue
        if parsed['Value'][1] != notselect[1]:
            # the hosts are different
            return candidate
    return None
def __init__(self, optionsDictionary):
    """
    Build the thread, store its settings and start it immediately as a daemon.

    :param optionsDictionary: mapping providing the 'Interactive', 'SleepTime'
                              and 'Site' entries
    """
    threading.Thread.__init__(self)

    # Settings supplied by the caller
    self.__interactive = optionsDictionary['Interactive']
    self.__sleep = optionsDictionary['SleepTime']

    # Internal state
    self._messageQueue = Queue.Queue()
    self._Transactions = []
    self._alive = True
    self._site = optionsDictionary['Site']
    self._hostname = Network.getFQDN()

    # Level boundaries, read from the LogLevels table
    levels = LogLevels()
    self._logLevels = levels
    self._negativeLevel = levels.getLevelValue('ERROR')
    self._positiveLevel = levels.getLevelValue('ALWAYS')
    self._maxBundledMessages = 20

    self.setDaemon(1)
    self.start()
def initialize(self):
    """
    Initialize the agent: read local configuration, discover the IP address
    and declare the instance as running.

    :return: S_OK() on success; S_ERROR("Halting!") after calling __haltInstance
             when the instance cannot be declared running; or the failing result
             of __getCSConfig
    """
    self.am_disableMonitoring()
    self.op = Operations.Operations()

    # Init vars
    self.runningPod = gConfig.getValue("/LocalSite/RunningPod")
    self.log.info("Running pod name of the image is %s" % self.runningPod)
    self.vmID = gConfig.getValue("/LocalSite/VMID")
    self.__loadHistory = []
    self.vmLoadAvgTimespan = None
    self.vmJobWrappersLocation = None
    self.haltPeriod = None
    self.haltBeforeMargin = None
    self.heartBeatPeriod = None
    self.am_setOption("MaxCycles", 0)
    self.am_setOption("PollingTime", 60)

    # Discover net address
    # Warning! On different clouds interface name may be different(eth, ens, ...)
    self.ipAddress = None
    netData = Network.discoverInterfaces()
    matching = (name for name in sorted(netData) if "eth" in name or "ens" in name)
    chosen = next(matching, None)
    if chosen is not None:
        self.ipAddress = netData[chosen]["ip"]
        self.log.info("IP Address is %s" % self.ipAddress)

    # Declare instance running
    self.uniqueID = ""
    lookup = virtualMachineDB.getUniqueIDByName(self.vmID)
    if lookup["OK"]:
        self.uniqueID = lookup["Value"]
    declared = self.__declareInstanceRunning()
    if not declared["OK"]:
        self.log.error("Could not declare instance running", declared["Message"])
        self.__haltInstance()
        return S_ERROR("Halting!")
    self.__instanceInfo = declared["Value"]

    # Get the cs config
    csConfig = self.__getCSConfig()
    if not csConfig["OK"]:
        return csConfig
    return S_OK()
def __addFoundDefinedComponent(self, compDictList):
    """
    Merge the components found in the DB set with the ones defined in compDictList.

    Each DB component located via walkSet is appended to the required-set entry.
    A DB component without a 'Status' gets 'OK' and is then checked for port
    consistency (services only) and heartbeat age. Defined components that no
    DB component matches perfectly are appended with an 'Error' status.

    :param compDictList: non-empty list of component definition dictionaries
    """
    cD = self.walkSet(self.__requiredSet, compDictList[0])
    dbD = self.walkSet(self.__dbSet, compDictList[0])
    now = Time.dateTime()
    unmatched = compDictList
    for dbComp in dbD:
        if 'Status' not in dbComp:
            self.__setStatus(dbComp, 'OK')
            if dbComp['Type'] == "service":
                if 'Port' not in dbComp:
                    self.__setStatus(dbComp, 'Error', "Port is not defined")
                elif dbComp['Port'] not in [compDict['Port'] for compDict in compDictList if 'Port' in compDict]:
                    # The comprehension variable is not visible here on Python 3
                    # (NameError). On Python 2 it leaked as the last element of
                    # compDictList, so that element is addressed explicitly.
                    self.__setStatus(compDictList[-1], 'Error',
                                     "Port (%s) is different that specified in the CS" % dbComp['Port'])
            elapsed = now - dbComp['LastHeartbeat']
            elapsed = elapsed.days * 86400 + elapsed.seconds
            if elapsed > self.__maxSecsSinceHeartbeat:
                self.__setStatus(dbComp, "Error",
                                 "Last heartbeat was received at %s (%s secs ago)" % (dbComp['LastHeartbeat'],
                                                                                      elapsed))
        cD.append(dbComp)
        # See if we have a perfect match: keep only the defined components that
        # this DB component does not match on every field.
        newUnmatched = []
        for unmatchedComp in unmatched:
            perfectMatch = True
            for field in unmatchedComp:
                if field in ('Status', 'Message'):
                    continue
                if field not in dbComp:
                    perfectMatch = False
                    continue
                if field == 'Host':
                    result = Network.checkHostsMatch(unmatchedComp[field], dbComp[field])
                    if not result['OK'] or not result['Value']:
                        perfectMatch = False
                else:
                    if unmatchedComp[field] != dbComp[field]:
                        perfectMatch = False
            if not perfectMatch:
                newUnmatched.append(unmatchedComp)
        unmatched = newUnmatched
    for unmatchedComp in unmatched:
        self.__setStatus(unmatchedComp, "Error", "There is no component up with this properties")
        cD.append(unmatchedComp)
def getGenericVMId():
    """
    Build an identifier for this machine from its boot time and MAC addresses.

    The "btime" value of /proc/stat and the 'mac' entry of every interface
    reported by Network.discoverInterfaces (except "lo", in sorted interface
    name order) are fed into an MD5 hash.

    :return: S_OK(hex digest) or S_ERROR when no btime line is found
    """
    def toBytes(value):
        # hashlib only accepts bytes on Python 3; a Python 2 str is already bytes.
        return value if isinstance(value, bytes) else value.encode()

    btime = None
    # The context manager closes the file even if reading/parsing raises.
    with open("/proc/stat") as fd:
        for line in fd:
            fields = List.fromChar(line, " ")
            if fields and fields[0] == "btime":
                btime = fields[1]
                break
    if not btime:
        return S_ERROR("Could not find btime in /proc/stat")

    md5Hash = md5()
    md5Hash.update(toBytes(btime))
    netData = Network.discoverInterfaces()
    for iface in sorted(netData):
        if iface == "lo":
            continue
        md5Hash.update(toBytes(netData[iface]['mac']))
    return S_OK(md5Hash.hexdigest())
def __selectUrl(self, notselect, urls):
    """
    Pick a URL whose host differs from the one in `notselect`.

    Used when multiple services are running in the same host and a new url has
    to be in a different host.

    :param notselect: split URL that should NOT be selected; item 1 is compared
                      with item 1 of each split candidate
    :param list urls: list of potential URLs
    :return: str -- the first candidate on a different host, or None when there
             is none (the fallback to the already selected url is left to the caller)
    """
    for candidate in urls:
        parsed = Network.splitURL(candidate)
        if not parsed["OK"]:
            gLogger.error(parsed["Message"])
            continue
        if parsed["Value"][1] != notselect[1]:
            # the hosts are different
            return candidate
    return None
def __selectUrl(self, notselect, urls):
    """
    Pick a URL whose host differs from the one in `notselect`.

    Used when multiple services are running in the same host and a new url has
    to be in a different host.

    :param notselect: split URL that should NOT be selected; item 1 is compared
                      with item 1 of each split candidate
    :param urls: list of potential URLs
    :return: the first candidate on a different host, or None when there is none
             (the fallback to the already selected url is left to the caller)

    WARNING: COPY/PASTE FROM Core/Diset/private/BaseClient
    """
    for candidate in urls:
        parsed = Network.splitURL(candidate)
        if not parsed['OK']:
            gLogger.error(parsed['Message'])
            continue
        if parsed['Value'][1] != notselect[1]:
            # the hosts are different
            return candidate
    return None
def __discoverURL(self):
    """
    Calculate the final URL.

    On success it sets self.serviceURL (from __findServiceURL), self.__URLTuple
    (from Network.splitURL) and self._serviceName (last element of the tuple),
    then copies the options under /DIRAC/ConnConf/<tuple[1]>:<tuple[2]> into
    self.kwargs without overriding keys that are already present.

    :return: S_OK() on success, otherwise the failing result structure
             (or S_ERROR with the repr of an exception raised by __findServiceURL)
    """
    try:
        lookup = self.__findServiceURL()
    except Exception as exc:
        return S_ERROR(repr(exc))
    if not lookup['OK']:
        return lookup
    self.serviceURL = lookup['Value']

    split = Network.splitURL(self.serviceURL)
    if not split['OK']:
        return split
    self.__URLTuple = split['Value']
    self._serviceName = self.__URLTuple[-1]

    # Per-destination connection options act only as defaults.
    connConf = gConfig.getOptionsDict("/DIRAC/ConnConf/%s:%s" % self.__URLTuple[1:3])
    if connConf['OK']:
        options = connConf['Value']
        for option in options:
            self.kwargs.setdefault(option, options[option])
    return S_OK()
def __discoverURL(self):
    """
    Calculate the final URL.

    On success it sets self.serviceURL (from __findServiceURL), self.__URLTuple
    (from Network.splitURL) and self._serviceName (last element of the tuple),
    then copies the options under /DIRAC/ConnConf/<tuple[1]>:<tuple[2]> into
    self.kwargs without overriding keys that are already present.

    :return: S_OK() on success, otherwise the failing result structure
             (or S_ERROR with the repr of an exception raised by __findServiceURL)
    """
    try:
        found = self.__findServiceURL()
    except Exception as err:
        return S_ERROR(repr(err))
    if not found['OK']:
        return found
    self.serviceURL = found['Value']

    parsed = Network.splitURL(self.serviceURL)
    if not parsed['OK']:
        return parsed
    self.__URLTuple = parsed['Value']
    self._serviceName = self.__URLTuple[-1]

    # Per-destination connection options act only as defaults.
    extra = gConfig.getOptionsDict("/DIRAC/ConnConf/%s:%s" % self.__URLTuple[1:3])
    if extra['OK']:
        defaults = extra['Value']
        for name in defaults:
            self.kwargs.setdefault(name, defaults[name])
    return S_OK()
def createRMSRecord(self, status, nbObject):
    """
    Build a monitoring record for file-type objects, to be sent to the ES backend.

    It is used inside DMS/Agent/RequestOperations.

    :param status: one of Attempted, Failed, or Successful
    :param nbObject: number of objects in question
    :returns: a dictionary with the timestamp, host, object/operation type,
              status, object count and parent operation ID
    """
    record = {}
    record["timestamp"] = int(TimeUtilities.toEpoch())
    record["host"] = Network.getFQDN()
    record["objectType"] = "File"
    record["operationType"] = self.operation.Type
    record["status"] = status
    record["nbObject"] = nbObject
    record["parentID"] = self.operation.OperationID
    return record
def _executeAction(self, trid, proposalTuple, handlerObj):
    """
    Run the handler action and, when activity monitoring is enabled, record
    the service response time.

    A failed handler response is returned unchanged. Previously it fell through
    to response["Value"], raising a KeyError that was swallowed below and
    replaced the real error message with a generic "Server error".

    :param trid: transport id (not used here)
    :param proposalTuple: action proposal forwarded to the handler
    :param handlerObj: handler exposing _rh_executeAction
    :return: first item of the handler's 'Value' on success, the failed
             response structure on handler error, or S_ERROR if an exception
             is raised while executing
    """
    try:
        response = handlerObj._rh_executeAction(proposalTuple)
        if not response["OK"]:
            return response
        if self.activityMonitoring:
            self.activityMonitoringReporter.addRecord({
                'timestamp': int(Time.toEpoch()),
                'host': Network.getFQDN(),
                'componentType': 'service',
                'component': "_".join(self._name.split("/")),
                'componentLocation': self._cfg.getURL(),
                'ServiceResponseTime': response["Value"][1]
            })
        return response["Value"][0]
    except Exception as e:
        gLogger.exception("Exception while executing handler action")
        return S_ERROR("Server error while executing action: %s" % str(e))
def siteName():
    """
    Determine and return DIRAC name for current site.

    The value is computed once and cached in the module-level __siteName.
    It is read from /LocalSite/Site, with a default of the form
    DIRAC.Client.<suffix> derived from the local FQDN.
    """
    global __siteName
    if __siteName:
        return __siteName

    #FIXME: does this ever happen that we have to use the defaultValue if getValue ???
    from DIRAC.Core.Utilities import Network
    # Some Defaults if not present in the configuration
    fqdnParts = Network.getFQDN().split('.')
    if len(fqdnParts) > 2:
        # Use the last component of the FQDN as country code if there are more than 2 components
        fallback = 'DIRAC.Client.%s' % fqdnParts[-1]
    else:
        # else use local as country code
        fallback = 'DIRAC.Client.local'

    __siteName = gConfig.getValue('/LocalSite/Site', fallback)
    return __siteName
def _executeAction(self, trid, proposalTuple, handlerObj):
    """
    Run the handler action and, when activity monitoring is enabled, record
    the service response time.

    :param trid: transport id (not used here)
    :param proposalTuple: action proposal forwarded to the handler
    :param handlerObj: handler exposing _rh_executeAction
    :return: first item of the handler's 'Value' on success, the failed
             response structure on handler error, or S_ERROR if an exception
             is raised while executing
    """
    try:
        response = handlerObj._rh_executeAction(proposalTuple)
        if not response["OK"]:
            return response
        payload = response["Value"]
        if self.activityMonitoring:
            record = {
                "timestamp": int(TimeUtilities.toEpoch()),
                "Host": Network.getFQDN(),
                "ServiceName": "_".join(self._name.split("/")),
                "Location": self._cfg.getURL(),
                "ResponseTime": payload[1],
            }
            self.activityMonitoringReporter.addRecord(record)
        return payload[0]
    except Exception as exc:
        gLogger.exception("Exception while executing handler action")
        return S_ERROR("Server error while executing action: %s" % str(exc))
def getSocket(self, hostAddress, **kwargs):
    """
    Open a client connection to hostAddress and perform the client handshake.

    The host name is resolved to a list of IPs which are tried in the order
    returned until __connect succeeds; the handshake is then attempted once.
    The former `for _ in xrange(1)` wrapper ran exactly once and relied on the
    Python-2-only xrange, so it has been flattened without changing behaviour.

    :param hostAddress: sequence whose first item is the host name and second the port
    :param kwargs: forwarded to generateClientInfo; 'enableSessions' additionally
                   stores the session in gSessionManager
    :return: S_OK(socketInfo) or a failing result structure
    """
    hostName = hostAddress[0]
    retVal = self.generateClientInfo(hostName, kwargs)
    if not retVal['OK']:
        return retVal
    socketInfo = retVal['Value']

    retVal = Network.getIPsForHostName(hostName)
    if not retVal['OK']:
        return S_ERROR("Could not resolve %s: %s" % (hostName, retVal['Message']))
    # In that case the first ip always the correct one.
    ipList = retVal['Value']

    errorsList = []
    for ip in ipList:
        ipAddress = (ip, hostAddress[1])
        retVal = self.__connect(socketInfo, ipAddress)
        if retVal['OK']:
            sslSocket = retVal['Value']
            break
        errorsList.append("%s: %s" % (ipAddress, retVal['Message']))
    else:
        # No IP accepted the connection (also reached when ipList is empty).
        return S_ERROR("Could not connect to %s: %s" % (hostAddress, ",".join(errorsList)))

    # Did the auth or the connection fail?
    retVal = socketInfo.doClientHandshake()
    if not retVal['OK']:
        return retVal

    if kwargs.get('enableSessions'):
        sessionId = hash(hostAddress)
        gSessionManager.set(sessionId, sslSocket.get_session())
    return S_OK(socketInfo)
def __init__(self, sleepTime, interactive, site):
    """
    Initialization of the ServerHandler.

    Both base initializers are run, the log queue and bookkeeping attributes
    are created, the local hostname is recorded, and the thread is started
    as a daemon.

    :params sleepTime: integer, representing time in seconds where the handler can send messages.
    :params interactive: not used at the moment.
    :params site: the site where the log messages come from.
    """
    super(ServerHandler, self).__init__()
    threading.Thread.__init__(self)

    # Values handed in by the caller
    self.__sleepTime = sleepTime
    self.__interactive = interactive
    self.__site = site

    # Internal state
    self.__logQueue = Queue.Queue()
    self.__transactions = []
    self.__hostname = Network.getFQDN()
    self.__alive = True
    self.__maxBundledLogs = 20

    self.setDaemon(True)
    self.start()
def __reduceComponentList(self, componentList):
    """
    Only keep the most restrictive components.

    A component is discarded when another component that has not itself been
    discarded contains every one of its keys with a matching value. 'Host'
    values are compared with Network.checkHostsMatch, all other values with
    plain equality. Falsy (e.g. empty) components are never returned.

    The input list is left untouched; discarded entries are tracked by index
    instead of being overwritten with False in the caller's list.

    :type componentList: list
    :param componentList: A list of components.
    :return: A list of reduced components, in their original order.
    """
    dropped = set()
    for i, component in enumerate(componentList):
        for j, candidate in enumerate(componentList):
            if i == j or j in dropped:
                continue
            for key in component:
                if key not in candidate:
                    break
                if key == 'Host':
                    result = Network.checkHostsMatch(component[key], candidate[key])
                    if not result['OK'] or not result['Value']:
                        break
                elif component[key] != candidate[key]:
                    break
            else:
                # Every key of `component` is matched by `candidate`:
                # `component` is the less restrictive of the two.
                dropped.add(i)
                break
    return [comp for idx, comp in enumerate(componentList) if idx not in dropped and comp]
class BaseClient:
    """
    Base class for service clients.

    The constructor stores the destination service and keyword arguments, then
    runs a chain of discovery steps; the first failing step is remembered in
    the private init status.
    """

    VAL_EXTRA_CREDENTIALS_HOST = "hosts"

    # Names of the keyword arguments understood by the client
    KW_USE_CERTIFICATES = "useCertificates"
    KW_EXTRA_CREDENTIALS = "extraCredentials"
    KW_TIMEOUT = "timeout"
    KW_SETUP = "setup"
    KW_VO = "VO"
    KW_DELEGATED_DN = "delegatedDN"
    KW_DELEGATED_GROUP = "delegatedGroup"
    KW_IGNORE_GATEWAYS = "ignoreGateways"
    KW_PROXY_LOCATION = "proxyLocation"
    KW_PROXY_STRING = "proxyString"
    KW_PROXY_CHAIN = "proxyChain"
    KW_SKIP_CA_CHECK = "skipCACheck"
    KW_KEEP_ALIVE_LAPSE = "keepAliveLapse"

    def __init__(self, serviceName, **kwargs):
        """
        :param serviceName: name (or URL) of the destination service, a string
        :param kwargs: client options, see the KW_* constants
        :raises TypeError: if serviceName is not a string
        """
        if not isinstance(serviceName, types.StringType):
            raise TypeError(
                "Service name expected to be a string. Received %s type %s" %
                (str(serviceName), type(serviceName)))
        self._destinationSrv = serviceName
        self.kwargs = kwargs
        self.__initStatus = S_OK()
        self.__idDict = {}
        self.__enableThreadCheck = False
        # Run every discovery step; only the first failure is kept.
        for initFunc in (self.__discoverSetup, self.__discoverVO,
                         self.__discoverTimeout, self.__discoverURL,
                         self.__discoverCredentialsToUse,
                         self.__discoverExtraCredentials,
                         self.__checkTransportSanity,
                         self.__setKeepAliveLapse):
            result = initFunc()
            if not result['OK'] and self.__initStatus['OK']:
                self.__initStatus = result
        self._initialize()
        # HACK for thread-safety:
        self.__allowedThreadID = False

    def _initialize(self):
        """Hook called at the end of __init__; does nothing by default."""
        pass

    def getDestinationService(self):
        """Return the service name given to the constructor."""
        return self._destinationSrv

    def __discoverSetup(self):
        """Set self.setup from the 'setup' keyword argument or from /DIRAC/Setup."""
        # Which setup to use?
        if self.KW_SETUP in self.kwargs and self.kwargs[self.KW_SETUP]:
            self.setup = str(self.kwargs[self.KW_SETUP])
        else:
            self.setup = gConfig.getValue("/DIRAC/Setup", "Test")
        return S_OK()

    def __discoverVO(self):
        """Set self.vo from the 'VO' keyword argument or from /DIRAC/VirtualOrganization."""
        # Which VO to use?
        if self.KW_VO in self.kwargs and self.kwargs[self.KW_VO]:
            self.vo = str(self.kwargs[self.KW_VO])
        else:
            self.vo = gConfig.getValue("/DIRAC/VirtualOrganization", "unknown")
        return S_OK()

    def __discoverURL(self):
        """
        Calculate the final URL.

        Sets self.serviceURL, self.__URLTuple (from Network.splitURL) and
        self._serviceName (last element of the tuple).

        :return: S_OK() or a failing result structure
        """
        try:
            result = self.__findServiceURL()
        except Exception as e:
            # "as" form: valid on Python 2.6+ and Python 3 alike
            return S_ERROR(str(e))
        if not result['OK']:
            return result
        self.serviceURL = result['Value']
        retVal = Network.splitURL(self.serviceURL)
        if not retVal['OK']:
            return S_ERROR("URL is malformed: %s" % retVal['Message'])
        self.__URLTuple = retVal['Value']
        self._serviceName = self.__URLTuple[-1]
        return S_OK()
def __call__(self):
    """
    Process the request: execute its waiting operations one by one.

    Outline of what the code below does:
      1. Set up the proxy of the request owner. On failure the request is
         failed, delayed or left as is (depending on the error) and returned
         wrapped in S_OK.
      2. While the request is "Waiting": fetch the waiting operation, find its
         handler, run it, and report Attempted/Failed/Successful marks either
         to the rmsMonitoringReporter (when self.rmsMonitoring is set) or to
         gMonitor.
      3. If the request ended up "Done": update it in the RequestDB and, when a
         job is attached, finalize it (up to 10 attempts, 10 seconds apart).
      4. Commit monitoring records.

    :return: S_OK(self.request) in the normal case; S_ERROR or a failing result
             structure when an operation cannot be fetched, a handler raises,
             the request cannot be updated or it cannot be finalized
    """
    self.log.debug("about to execute request")
    if not self.rmsMonitoring:
        gMonitor.addMark("RequestAtt", 1)

    # # setup proxy for request owner
    setupProxy = self.setupProxy()
    if not setupProxy["OK"]:
        userSuspended = "User is currently suspended"
        self.request.Error = setupProxy["Message"]
        # In case the user does not have proxy
        if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
            self.log.error("Error setting proxy. Request set to Failed:", setupProxy["Message"])
            # If user is no longer registered, fail the request
            for operation in self.request:
                for opFile in operation:
                    opFile.Status = "Failed"
                operation.Status = "Failed"
        elif userSuspended in setupProxy["Message"]:
            # If user is suspended, wait for a long time
            self.request.delayNextExecution(6 * 60)
            self.request.Error = userSuspended
            self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN)
        else:
            self.log.error("Error setting proxy", setupProxy["Message"])
        # The proxy problem is recorded on the request itself, hence S_OK here.
        return S_OK(self.request)
    shifter = setupProxy["Value"]["Shifter"]

    # Set to the exception text if a handler raises; checked after the loop.
    error = None

    while self.request.Status == "Waiting":

        # # get waiting operation
        operation = self.request.getWaiting()
        if not operation["OK"]:
            self.log.error("Cannot get waiting operation", operation["Message"])
            return operation
        operation = operation["Value"]
        self.log.info("executing operation", "%s" % operation.Type)

        # # and handler for it
        handler = self.getHandler(operation)
        if not handler["OK"]:
            self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"]))
            # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
            operation.Error = handler["Message"]
            break

        handler = handler["Value"]
        # # set shifters list in the handler
        handler.shifter = shifter
        # set rmsMonitoring flag for the RequestOperation
        handler.rmsMonitoring = self.rmsMonitoring
        # # and execute
        pluginName = self.getPluginName(
            self.handlersDict.get(operation.Type))
        if self.standalone:
            useServerCertificate = gConfig.useServerCertificate()
        else:
            # Always use server certificates if executed within an agent
            useServerCertificate = True
        try:
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Attempted",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
            # Always use request owner proxy: the server-certificate option is
            # switched off around the handler call and restored right after.
            if useServerCertificate:
                gConfigurationData.setOptionInCFG(
                    "/DIRAC/Security/UseServerCertificate", "false")
            exe = handler()
            if useServerCertificate:
                gConfigurationData.setOptionInCFG(
                    "/DIRAC/Security/UseServerCertificate", "true")
            if not exe["OK"]:
                self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"]))
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp": int(Time.toEpoch()),
                            "host": Network.getFQDN(),
                            "objectType": "Operation",
                            "operationType": pluginName,
                            "objectID": operation.OperationID,
                            "parentID": operation.RequestID,
                            "status": "Failed",
                            "nbObject": 1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Request",
                        "objectID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if self.request.JobID:
                    # Check if the job exists
                    monitorServer = JobMonitoringClient(
                        useCertificates=True)
                    res = monitorServer.getJobSummary(
                        int(self.request.JobID))
                    if not res["OK"]:
                        self.log.error(
                            "RequestTask: Failed to get job status", "%d" % self.request.JobID)
                    elif not res["Value"]:
                        # Empty summary: the job is gone, so the operation is failed.
                        self.log.warn(
                            "RequestTask: job does not exist (anymore): failed request",
                            "JobID: %d" % self.request.JobID,
                        )
                        for opFile in operation:
                            opFile.Status = "Failed"
                        if operation.Status != "Failed":
                            operation.Status = "Failed"
                        self.request.Error = "Job no longer exists"
        except Exception as e:
            error = str(e)
            self.log.exception("hit by exception:", "%s" % error)
            if pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp": int(Time.toEpoch()),
                        "host": Network.getFQDN(),
                        "objectType": "Operation",
                        "operationType": pluginName,
                        "objectID": operation.OperationID,
                        "parentID": operation.RequestID,
                        "status": "Failed",
                        "nbObject": 1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Request",
                    "objectID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("RequestFail", 1)

            # Restore the option in case the handler raised before it was reset.
            if useServerCertificate:
                gConfigurationData.setOptionInCFG(
                    "/DIRAC/Security/UseServerCertificate", "true")
            break

        # # operation status check
        if operation.Status == "Done" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Successful",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
        elif operation.Status == "Failed" and pluginName:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp": int(Time.toEpoch()),
                    "host": Network.getFQDN(),
                    "objectType": "Operation",
                    "operationType": pluginName,
                    "objectID": operation.OperationID,
                    "parentID": operation.RequestID,
                    "status": "Failed",
                    "nbObject": 1,
                })
            else:
                gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
        elif operation.Status in ("Waiting", "Scheduled"):
            # # no update for waiting or all files scheduled
            break

    if not self.rmsMonitoring:
        gMonitor.flush()

    if error:
        return S_ERROR(error)

    # # request done?
    if self.request.Status == "Done":
        # # update request to the RequestDB
        self.log.info("Updating request status:", "%s" % self.request.Status)
        update = self.updateRequest()
        if not update["OK"]:
            self.log.error("Cannot update request status", update["Message"])
            return update
        self.log.info("request is done", "%s" % self.request.RequestName)
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord({
                "timestamp": int(Time.toEpoch()),
                "host": Network.getFQDN(),
                "objectType": "Request",
                "objectID": getattr(self.request, "RequestID", 0),
                "status": "Successful",
                "nbObject": 1,
            })
        else:
            gMonitor.addMark("RequestOK", 1)
        # # and there is a job waiting for it? finalize!
        if self.request.JobID:
            attempts = 0
            # Retry loop: gives up after the 10th failed attempt, sleeping
            # 10 seconds between attempts; only the first failure is logged as error.
            while True:
                finalizeRequest = self.requestClient.finalizeRequest(
                    self.request.RequestID, self.request.JobID  # pylint: disable=no-member
                )
                if not finalizeRequest["OK"]:
                    if not attempts:
                        self.log.error(
                            "unable to finalize request, will retry",
                            "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]),
                        )
                    self.log.debug("Waiting 10 seconds")
                    attempts += 1
                    if attempts == 10:
                        self.log.error("Giving up finalize request")
                        return S_ERROR("Could not finalize request")

                    time.sleep(10)

                else:
                    self.log.info(
                        "request is finalized",
                        "ReqName %s %s" % (self.request.RequestName, (" after %d attempts" % attempts) if attempts else ""),
                    )
                    break

    # Commit all the data to the ES Backend
    if self.rmsMonitoring:
        self.rmsMonitoringReporter.commit()
    # Request will be updated by the callBack method
    self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status)
    return S_OK(self.request)
class BaseClient:
    """
    Base class of the service clients.

    The constructor stores the target service and the keyword arguments, then
    runs a chain of discovery steps (setup, VO, timeout, URL, credentials,
    transport sanity, keep-alive lapse). Every step returns an S_OK/S_ERROR
    style dictionary; the first failing result is remembered as the
    initialization status and the remaining steps are still executed.
    """

    VAL_EXTRA_CREDENTIALS_HOST = "hosts"

    # Names of the keyword arguments understood by the client
    KW_USE_CERTIFICATES = "useCertificates"
    KW_EXTRA_CREDENTIALS = "extraCredentials"
    KW_TIMEOUT = "timeout"
    KW_SETUP = "setup"
    KW_VO = "VO"
    KW_DELEGATED_DN = "delegatedDN"
    KW_DELEGATED_GROUP = "delegatedGroup"
    KW_IGNORE_GATEWAYS = "ignoreGateways"
    KW_PROXY_LOCATION = "proxyLocation"
    KW_PROXY_STRING = "proxyString"
    KW_PROXY_CHAIN = "proxyChain"
    KW_SKIP_CA_CHECK = "skipCACheck"
    KW_KEEP_ALIVE_LAPSE = "keepAliveLapse"

    __threadConfig = ThreadConfig()

    def __init__(self, serviceName, **kwargs):
        """
        :param serviceName: name (or URL) of the service to contact, must be a string
        :param kwargs: client options, see the KW_* class constants
        :raises TypeError: if serviceName is not a string
        """
        # isinstance also accepts string subclasses; the getattr fallback keeps
        # the check usable where types.StringTypes does not exist.
        if not isinstance(serviceName, getattr(types, "StringTypes", (str,))):
            raise TypeError("Service name expected to be a string. Received %s type %s" %
                            (str(serviceName), type(serviceName)))
        self._destinationSrv = serviceName
        self._serviceName = serviceName
        self.kwargs = kwargs
        self.__initStatus = S_OK()
        self.__idDict = {}
        self.__extraCredentials = ""
        self.__enableThreadCheck = False
        self.__retry = 0
        self.__retryDelay = 0
        # by default we always have 1 url, for example:
        # RPCClient('dips://volhcb38.cern.ch:9162/Framework/SystemAdministrator')
        self.__nbOfUrls = 1
        self.__nbOfRetry = 3  # by default we try three times
        self.__bannedUrls = []
        for initFunc in (self.__discoverSetup,
                         self.__discoverVO,
                         self.__discoverTimeout,
                         self.__discoverURL,
                         self.__discoverCredentialsToUse,
                         self.__checkTransportSanity,
                         self.__setKeepAliveLapse):
            result = initFunc()
            # Only the first error is kept as the initialization status
            if not result['OK'] and self.__initStatus['OK']:
                self.__initStatus = result
        self._initialize()
        # HACK for thread-safety:
        self.__allowedThreadID = False

    def _initialize(self):
        """Hook called at the end of the constructor; does nothing here."""
        pass

    def getDestinationService(self):
        """Return the service name (or URL) given to the constructor."""
        return self._destinationSrv

    def getServiceName(self):
        """Return the service name (the last element of the split URL once the URL is discovered)."""
        return self._serviceName

    def __discoverSetup(self):
        """
        Select the setup to use: the KW_SETUP keyword argument if given,
        otherwise the thread configuration setup, otherwise the
        /DIRAC/Setup option (default "Test").

        :return: S_OK()
        """
        # Which setup to use?
        if self.KW_SETUP in self.kwargs and self.kwargs[self.KW_SETUP]:
            self.setup = str(self.kwargs[self.KW_SETUP])
        else:
            self.setup = self.__threadConfig.getSetup()
            if not self.setup:
                self.setup = gConfig.getValue("/DIRAC/Setup", "Test")
        return S_OK()

    def __discoverVO(self):
        """
        Select the VO to use: the KW_VO keyword argument if given, otherwise
        the /DIRAC/VirtualOrganization option (default "unknown").

        :return: S_OK()
        """
        # Which VO to use?
        if self.KW_VO in self.kwargs and self.kwargs[self.KW_VO]:
            self.vo = str(self.kwargs[self.KW_VO])
        else:
            self.vo = gConfig.getValue("/DIRAC/VirtualOrganization", "unknown")
        return S_OK()

    def __discoverURL(self):
        """
        Calculate the final URL and set self.serviceURL, the split URL tuple
        and self._serviceName. Options found under
        /DIRAC/ConnConf/<host>:<port> are added to kwargs when the
        corresponding key is not already set.

        :return: S_OK() / S_ERROR()
        """
        # Calculate final URL
        try:
            result = self.__findServiceURL()
        except Exception as e:  # "as" form is valid from Python 2.6 onwards
            return S_ERROR(str(e))
        if not result['OK']:
            return result
        self.serviceURL = result['Value']
        retVal = Network.splitURL(self.serviceURL)
        if not retVal['OK']:
            return S_ERROR("URL is malformed: %s" % retVal['Message'])
        self.__URLTuple = retVal['Value']
        self._serviceName = self.__URLTuple[-1]
        res = gConfig.getOptionsDict("/DIRAC/ConnConf/%s:%s" % self.__URLTuple[1:3])
        if res['OK']:
            opts = res['Value']
            for k in opts:
                # Explicit keyword arguments take precedence over the CS values
                if k not in self.kwargs:
                    self.kwargs[k] = opts[k]
        return S_OK()
def __findServiceURL(self):
    """
    Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

    Unless KW_IGNORE_GATEWAYS is set to a true value in kwargs, the first URL
    returned by getGatewayURLs() (reduced to its ``<protocol>://<host>`` part)
    is used as gateway.

    If self._destinationSrv (given as constructor attribute) starts with
    ``https://``, we just return this one. If we have to use a gateway, we
    just replace the server part of the URL.

    Otherwise the list of URLs is obtained from getServiceURLs (called with
    failover=True), banned URLs are dropped and the first remaining URL is
    selected, avoiding when possible a host that also serves a banned URL.

    This method also sets some attributes:
      * self.__nbOfUrls = number of URLs
      * self.__nbOfRetry removed in HTTPS (Managed by requests)
      * self.__bannedUrls is reinitialized if all the URLs are banned

    :return: S_OK(str)/S_ERROR() -- the selected URL

    WARNING (Mostly) COPY PASTE FROM BaseClient (protocols list is changed to https)
    """
    if not self.__initStatus["OK"]:
        return self.__initStatus

    # Load the Gateways URLs for the current site Name
    gatewayURL = False
    if not self.kwargs.get(self.KW_IGNORE_GATEWAYS):
        gatewayURLs = getGatewayURLs()
        if gatewayURLs:
            # Keep only the "<protocol>://<host>" part of the first gateway
            gatewayURL = "/".join(gatewayURLs[0].split("/")[:3])

    # If what was given as constructor attribute is a properly formed URL,
    # we just return this one.
    # If we have to use a gateway, we just replace the server name in it
    if self._destinationSrv.startswith("https://"):
        gLogger.debug("Already given a valid url", self._destinationSrv)
        if not gatewayURL:
            return S_OK(self._destinationSrv)
        gLogger.debug("Reconstructing given URL to pass through gateway")
        path = "/".join(self._destinationSrv.split("/")[3:])
        finalURL = "%s/%s" % (gatewayURL, path)
        gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
        return S_OK(finalURL)

    if gatewayURL:
        gLogger.debug("Using gateway", gatewayURL)
        return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

    # If no URL is given to the constructor, we extract the list of URLs from the CS
    # (System/URLs/Component)
    try:
        # NOTE(review): getServiceURLs is expected to return the randomized URLs followed
        # by the failover URLs (System/FailoverURLs/Component) -- confirm against its code
        urlsList = getServiceURLs(self._destinationSrv, setup=self.setup, failover=True)
    except Exception as e:
        return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
    if not urlsList:
        return S_ERROR("URL for service %s not found" % self._destinationSrv)

    self.__nbOfUrls = len(urlsList)
    # __nbOfRetry removed in HTTPS (managed by requests)
    if self.__nbOfUrls == len(self.__bannedUrls):
        self.__bannedUrls = []  # retry all urls
        gLogger.debug("Retrying again all URLs")

    if self.__bannedUrls and len(urlsList) > 1:
        # we have host which is not accessible. We remove that host from the list.
        # We only remove if we have more than one instance
        for bannedURL in self.__bannedUrls:
            # A banned URL may have disappeared from the configuration since it was
            # banned: list.remove would raise ValueError, so check membership first.
            # The last remaining URL is always kept so that a candidate exists.
            if bannedURL in urlsList and len(urlsList) > 1:
                gLogger.debug("Removing banned URL", "%s" % bannedURL)
                urlsList.remove(bannedURL)

    # Take the first URL from the list
    sURL = urlsList[0]

    # If we have banned URLs, and several URLs at disposals, we make sure that the selected sURL
    # is not on a host which is banned. If it is, we take the next one in the list using __selectUrl
    if self.__bannedUrls and self.__nbOfUrls > 2:
        # when we have multiple services then we can have a situation when
        # two services are running on the same machine with different ports...
        retVal = Network.splitURL(sURL)
        if retVal["OK"]:
            # The host comparison only makes sense when the selected URL could be split
            nexturl = retVal["Value"]
            found = False
            for bannedURL in self.__bannedUrls:
                retVal = Network.splitURL(bannedURL)
                if not retVal["OK"]:
                    break
                # We found a banned URL on the same host as the selected one
                if nexturl[1] == retVal["Value"][1]:
                    found = True
                    break
            if found:
                nexturl = self.__selectUrl(nexturl, urlsList[1:])
                if nexturl:  # an url found which is in different host
                    sURL = nexturl

    gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
    return S_OK(sURL)
def __generateUniqueClientName(self):
    """
    Build a client name as the MD5 hex digest of the current UTC time, a
    random number, the local FQDN and the logger name joined with ':'.

    :return: hexadecimal digest string
    """
    nameParts = [
        str(datetime.datetime.utcnow()),
        str(random.random()),
        Network.getFQDN(),
        gLogger.getName(),
    ]
    digest = md5(":".join(nameParts).encode())
    return digest.hexdigest()
def __findServiceURL(self):
    """
    Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

    Unless KW_IGNORE_GATEWAYS is set to a true value in kwargs, the option
    /DIRAC/Gateways/<siteName> is read and one of its comma separated entries
    (picked through List.randomize) is used as gateway.

    If self._destinationSrv starts with ``<protocol>://`` for one of the
    protocols of gProtocolDict, it is returned as is, or with its server part
    replaced by the gateway.

    Otherwise the URLs are taken from getServiceURL, banned URLs are dropped,
    the remaining ones are randomized and the first one is selected, avoiding
    when possible a host that also serves a banned URL.

    This method also sets some attributes:
      * self.__nbOfUrls = number of URLs
      * self.__nbOfRetry = 2 if we have more than 2 urls, otherwise 3
      * self.__bannedUrls is reinitialized if all the URLs are banned

    :return: S_OK(str)/S_ERROR() -- the selected URL
    """
    if not self.__initStatus['OK']:
        return self.__initStatus

    # Load the gateway URL for the current site name
    gatewayURL = False
    if self.KW_IGNORE_GATEWAYS not in self.kwargs or not self.kwargs[self.KW_IGNORE_GATEWAYS]:
        dRetVal = gConfig.getOption("/DIRAC/Gateways/%s" % DIRAC.siteName())
        if dRetVal['OK']:
            rawGatewayURL = List.randomize(List.fromChar(dRetVal['Value'], ","))[0]
            # Keep only the "<protocol>://<host>" part
            gatewayURL = "/".join(rawGatewayURL.split("/")[:3])

    # A destination that is already a URL is returned directly (through the gateway if any)
    for protocol in gProtocolDict:
        if self._destinationSrv.startswith("%s://" % protocol):
            gLogger.debug("Already given a valid url", self._destinationSrv)
            if not gatewayURL:
                return S_OK(self._destinationSrv)
            gLogger.debug("Reconstructing given URL to pass through gateway")
            path = "/".join(self._destinationSrv.split("/")[3:])
            finalURL = "%s/%s" % (gatewayURL, path)
            gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
            return S_OK(finalURL)

    if gatewayURL:
        gLogger.debug("Using gateway", gatewayURL)
        return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

    # We extract the list of URLs from the configuration
    try:
        urls = getServiceURL(self._destinationSrv, setup=self.setup)
    except Exception as e:
        return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
    if not urls:
        return S_ERROR("URL for service %s not found" % self._destinationSrv)

    urlsList = List.fromChar(urls, ",")
    self.__nbOfUrls = len(urlsList)
    # we retry 2 times all services, if we run more than 2 services
    self.__nbOfRetry = 2 if self.__nbOfUrls > 2 else 3
    if len(urlsList) == len(self.__bannedUrls):
        self.__bannedUrls = []  # retry all urls
        gLogger.debug("Retrying again all URLs")

    if self.__bannedUrls and len(urlsList) > 1:
        # we have host which is not accessible. We remove that host from the list.
        # We only remove if we have more than one instance
        for bannedURL in self.__bannedUrls:
            # A banned URL may have disappeared from the configuration since it was
            # banned: list.remove would raise ValueError, so check membership first.
            # The last remaining URL is always kept so that a candidate exists.
            if bannedURL in urlsList and len(urlsList) > 1:
                gLogger.debug("Removing banned URL", "%s" % bannedURL)
                urlsList.remove(bannedURL)

    randUrls = List.randomize(urlsList)
    sURL = randUrls[0]

    if self.__bannedUrls and self.__nbOfUrls > 2:
        # when we have multiple services then we can have a situation
        # when two services are running on the same machine with different ports...
        retVal = Network.splitURL(sURL)
        if retVal['OK']:
            # The host comparison only makes sense when the selected URL could be split
            nexturl = retVal['Value']
            found = False
            for bannedURL in self.__bannedUrls:
                retVal = Network.splitURL(bannedURL)
                if not retVal['OK']:
                    break
                if nexturl[1] == retVal['Value'][1]:
                    found = True
                    break
            if found:
                nexturl = self.__selectUrl(nexturl, randUrls[1:])
                if nexturl:  # an url found which is in different host
                    sURL = nexturl

    gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
    return S_OK(sURL)
def am_go(self):
    """
    Run one cycle of the agent module and log a summary of it.

    The shifter proxy is set first (its error result is returned as is on
    failure). When a watchdog time is configured, SIGALRM is armed before the
    cycle and cancelled once the summary has been produced. Cycle counters
    and the total elapsed time of the module properties are updated.

    :return: the result of the cycle execution, or the failed result of
             the shifter proxy setup
    """
    # Set the shifter proxy if required
    proxyResult = self._setShifterProxy()
    if not proxyResult['OK']:
        return proxyResult

    separator = "-" * 40
    self.log.notice(separator)
    self.log.notice("Starting cycle for module %s" % self.__moduleProperties['fullName'])
    maxCycles = self.am_getMaxCycles()
    if maxCycles > 0:
        doneSoFar = self.__moduleProperties['cyclesDone']
        self.log.notice("Remaining %s of %s cycles" % (maxCycles - doneSoFar, maxCycles))
    self.log.notice(separator)

    # use SIGALARM as a watchdog interrupt if enabled
    watchdogInt = self.am_getWatchdogTime()
    watchdogEnabled = watchdogInt > 0
    if watchdogEnabled:
        signal.signal(signal.SIGALRM, signal.SIG_DFL)
        signal.alarm(watchdogInt)

    startTime = time.time()
    cpuStats = self._startReportToMonitoring()
    cycleResult = self.__executeModuleCycle()
    if cpuStats:
        self._endReportToMonitoring(*cpuStats)

    # Increment counters
    self.__moduleProperties['cyclesDone'] += 1

    # Show status
    elapsedTime = time.time() - startTime
    self.__moduleProperties['totalElapsedTime'] += elapsedTime
    self.log.notice(separator)
    self.log.notice("Agent module %s run summary" % self.__moduleProperties['fullName'])
    self.log.notice(" Executed %s times previously" % self.__moduleProperties['cyclesDone'])
    self.log.notice(" Cycle took %.2f seconds" % elapsedTime)
    totalElapsed = self.__moduleProperties['totalElapsedTime']
    averageElapsedTime = totalElapsed / self.__moduleProperties['cyclesDone']
    self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime))
    elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption('PollingTime')
    self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime'))
    self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate)

    if not cycleResult['OK']:
        self.log.warn(" Cycle had an error:", cycleResult['Message'])
    else:
        self.log.notice(" Cycle was successful")
        if self.activityMonitoring:
            # The cycle duration is recorded together with some basic details about
            # the component; the record is not committed to the ES backend here.
            cycleRecord = {
                'timestamp': int(Time.toEpoch()),
                'host': Network.getFQDN(),
                'componentType': "agent",
                'component': "_".join(self.__moduleProperties['fullName'].split("/")),
                'cycleDuration': elapsedTime,
                'cycles': 1
            }
            self.activityMonitoringReporter.addRecord(cycleRecord)
    self.log.notice(separator)

    # Update number of cycles
    if not self.activityMonitoring:
        self.monitor.setComponentExtraParam('cycles', self.__moduleProperties['cyclesDone'])

    # the cycle is over, cancel watchdog
    if watchdogEnabled:
        signal.alarm(0)
    return cycleResult
def getHostname(self):
    """
    Return the value of the /DIRAC/Hostname option, or the FQDN reported by
    Network.getFQDN() when that option is empty.
    """
    configuredName = self.getOption("/DIRAC/Hostname")
    if configuredName:
        return configuredName
    return Network.getFQDN()
def __findServiceURL(self):
    """
    Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

    If the site on which we run is configured to use gateways (/DIRAC/Gateways/<siteName>),
    these URLs will be used. To ignore the gateway, it is possible to set KW_IGNORE_GATEWAYS
    to True in kwargs.

    If self._destinationSrv (given as constructor attribute) is a properly formed URL,
    we just return this one. If we have to use a gateway, we just
    replace the server name in the url.

    The list of URLs defined in the CS (<System>/URLs/<Component>) is randomized,
    and the failover URLs are appended at the end.

    This method also sets some attributes:
      * self.__nbOfUrls = number of URLs
      * self.__nbOfRetry = 2 if we have more than 2 urls, otherwise 3
      * self.__bannedUrls is reinitialized if all the URLs are banned

    :return: S_OK(str)/S_ERROR() -- the selected URL
    """
    if not self.__initStatus['OK']:
        return self.__initStatus

    # Load the Gateways URLs for the current site Name
    gatewayURL = False
    if self.KW_IGNORE_GATEWAYS not in self.kwargs or not self.kwargs[self.KW_IGNORE_GATEWAYS]:
        dRetVal = gConfig.getOption("/DIRAC/Gateways/%s" % DIRAC.siteName())
        if dRetVal['OK']:
            rawGatewayURL = List.randomize(List.fromChar(dRetVal['Value'], ","))[0]
            # Keep only the "<protocol>://<host>" part
            gatewayURL = "/".join(rawGatewayURL.split("/")[:3])

    # If what was given as constructor attribute is a properly formed URL,
    # we just return this one.
    # If we have to use a gateway, we just replace the server name in it
    for protocol in gProtocolDict:
        if self._destinationSrv.startswith("%s://" % protocol):
            gLogger.debug("Already given a valid url", self._destinationSrv)
            if not gatewayURL:
                return S_OK(self._destinationSrv)
            gLogger.debug("Reconstructing given URL to pass through gateway")
            path = "/".join(self._destinationSrv.split("/")[3:])
            finalURL = "%s/%s" % (gatewayURL, path)
            gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
            return S_OK(finalURL)

    if gatewayURL:
        gLogger.debug("Using gateway", gatewayURL)
        return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

    # We extract the list of URLs from the CS (System/URLs/Component)
    try:
        urls = getServiceURL(self._destinationSrv, setup=self.setup)
    except Exception as e:
        return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
    if not urls:
        return S_ERROR("URL for service %s not found" % self._destinationSrv)

    # Try if there are some failover URLs to use as last resort.
    # This is best effort: a failure is only reported in the debug log.
    failoverUrls = []
    try:
        failoverUrlsStr = getServiceFailoverURL(self._destinationSrv, setup=self.setup)
        if failoverUrlsStr:
            failoverUrls = failoverUrlsStr.split(',')
    except Exception as e:
        gLogger.debug("Could not get failover URLs", repr(e))

    # We randomize the list, and add at the end the failover URLs (System/FailoverURLs/Component)
    urlsList = List.randomize(List.fromChar(urls, ",")) + failoverUrls
    self.__nbOfUrls = len(urlsList)
    # we retry 2 times all services, if we run more than 2 services
    self.__nbOfRetry = 2 if self.__nbOfUrls > 2 else 3
    if self.__nbOfUrls == len(self.__bannedUrls):
        self.__bannedUrls = []  # retry all urls
        gLogger.debug("Retrying again all URLs")

    if self.__bannedUrls and len(urlsList) > 1:
        # we have host which is not accessible. We remove that host from the list.
        # We only remove if we have more than one instance
        for bannedURL in self.__bannedUrls:
            # A banned URL may have disappeared from the configuration since it was
            # banned: list.remove would raise ValueError, so check membership first.
            # The last remaining URL is always kept so that a candidate exists.
            if bannedURL in urlsList and len(urlsList) > 1:
                gLogger.debug("Removing banned URL", "%s" % bannedURL)
                urlsList.remove(bannedURL)

    # Take the first URL from the list
    sURL = urlsList[0]

    # If we have banned URLs, and several URLs at disposals, we make sure that the selected sURL
    # is not on a host which is banned. If it is, we take the next one in the list using __selectUrl
    if self.__bannedUrls and self.__nbOfUrls > 2:
        # when we have multiple services then we can have a situation when
        # two services are running on the same machine with different ports...
        retVal = Network.splitURL(sURL)
        if retVal['OK']:
            # The host comparison only makes sense when the selected URL could be split
            nexturl = retVal['Value']
            found = False
            for bannedURL in self.__bannedUrls:
                retVal = Network.splitURL(bannedURL)
                if not retVal['OK']:
                    break
                # We found a banned URL on the same host as the selected one
                if nexturl[1] == retVal['Value'][1]:
                    found = True
                    break
            if found:
                nexturl = self.__selectUrl(nexturl, urlsList[1:])
                if nexturl:  # an url found which is in different host
                    sURL = nexturl

    gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
    return S_OK(sURL)
def getHostname(self):
    """
    Give the host name to use: the /DIRAC/Hostname option when it is set,
    the FQDN from Network.getFQDN() otherwise.
    """
    fromOption = self.getOption("/DIRAC/Hostname")
    return fromOption if fromOption else Network.getFQDN()
def __findServiceURL(self):
    """
    Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

    If the site on which we run is configured to use gateways (/DIRAC/Gateways/<siteName>),
    these URLs will be used. To ignore the gateway, it is possible to set KW_IGNORE_GATEWAYS
    to True in kwargs.

    If self._destinationSrv (given as constructor attribute) is a properly formed URL,
    we just return this one. If we have to use a gateway, we just
    replace the server name in the url.

    The list of URLs defined in the CS (<System>/URLs/<Component>) is randomized,
    and the failover URLs are appended at the end.

    This method also sets some attributes:
      * self.__nbOfUrls = number of URLs
      * self.__nbOfRetry = 2 if we have more than 2 urls, otherwise 3
      * self.__bannedUrls is reinitialized if all the URLs are banned

    :return: S_OK(str)/S_ERROR() -- the selected URL
    """
    if not self.__initStatus['OK']:
        return self.__initStatus

    # Load the Gateways URLs for the current site Name
    gatewayURL = False
    if not self.kwargs.get(self.KW_IGNORE_GATEWAYS):
        dRetVal = gConfig.getOption("/DIRAC/Gateways/%s" % DIRAC.siteName())
        if dRetVal['OK']:
            rawGatewayURL = List.randomize(List.fromChar(dRetVal['Value'], ","))[0]
            # Keep only the "<protocol>://<host>" part
            gatewayURL = "/".join(rawGatewayURL.split("/")[:3])

    # If what was given as constructor attribute is a properly formed URL,
    # we just return this one.
    # If we have to use a gateway, we just replace the server name in it
    for protocol in gProtocolDict:
        if self._destinationSrv.startswith("%s://" % protocol):
            gLogger.debug("Already given a valid url", self._destinationSrv)
            if not gatewayURL:
                return S_OK(self._destinationSrv)
            gLogger.debug("Reconstructing given URL to pass through gateway")
            path = "/".join(self._destinationSrv.split("/")[3:])
            finalURL = "%s/%s" % (gatewayURL, path)
            gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
            return S_OK(finalURL)

    if gatewayURL:
        gLogger.debug("Using gateway", gatewayURL)
        return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

    # We extract the list of URLs from the CS (System/URLs/Component)
    try:
        urls = getServiceURL(self._destinationSrv, setup=self.setup)
    except Exception as e:
        return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
    if not urls:
        return S_ERROR("URL for service %s not found" % self._destinationSrv)

    # Try if there are some failover URLs to use as last resort.
    # This is best effort: a failure is only reported in the debug log.
    failoverUrls = []
    try:
        failoverUrlsStr = getServiceFailoverURL(self._destinationSrv, setup=self.setup)
        if failoverUrlsStr:
            failoverUrls = failoverUrlsStr.split(',')
    except Exception as e:
        gLogger.debug("Could not get failover URLs", repr(e))

    # We randomize the list, and add at the end the failover URLs (System/FailoverURLs/Component)
    urlsList = List.randomize(List.fromChar(urls, ",")) + failoverUrls
    self.__nbOfUrls = len(urlsList)
    # we retry 2 times all services, if we run more than 2 services
    self.__nbOfRetry = 2 if self.__nbOfUrls > 2 else 3
    if self.__nbOfUrls == len(self.__bannedUrls):
        self.__bannedUrls = []  # retry all urls
        gLogger.debug("Retrying again all URLs")

    if self.__bannedUrls and len(urlsList) > 1:
        # we have host which is not accessible. We remove that host from the list.
        # We only remove if we have more than one instance
        for bannedURL in self.__bannedUrls:
            # A banned URL may have disappeared from the configuration since it was
            # banned: list.remove would raise ValueError, so check membership first.
            # The last remaining URL is always kept so that a candidate exists.
            if bannedURL in urlsList and len(urlsList) > 1:
                gLogger.debug("Removing banned URL", "%s" % bannedURL)
                urlsList.remove(bannedURL)

    # Take the first URL from the list
    sURL = urlsList[0]

    # If we have banned URLs, and several URLs at disposals, we make sure that the selected sURL
    # is not on a host which is banned. If it is, we take the next one in the list using __selectUrl
    if self.__bannedUrls and self.__nbOfUrls > 2:
        # when we have multiple services then we can have a situation when
        # two services are running on the same machine with different ports...
        retVal = Network.splitURL(sURL)
        if retVal['OK']:
            # The host comparison only makes sense when the selected URL could be split
            nexturl = retVal['Value']
            found = False
            for bannedURL in self.__bannedUrls:
                retVal = Network.splitURL(bannedURL)
                if not retVal['OK']:
                    break
                # We found a banned URL on the same host as the selected one
                if nexturl[1] == retVal['Value'][1]:
                    found = True
                    break
            if found:
                nexturl = self.__selectUrl(nexturl, urlsList[1:])
                if nexturl:  # an url found which is in different host
                    sURL = nexturl

    gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
    return S_OK(sURL)
def getVMIP(self):
    """
    Return the 'ip' entry of the 'eth0' interface from the data given by
    Network.discoverInterfaces().
    """
    return Network.discoverInterfaces()['eth0']['ip']
def execute(self):
    """
    Read requests from RequestClient and enqueue them into ProcessPool.

    Requests are fetched one by one, or in bulk when bulk requests are
    enabled, until the per-cycle limit is reached or no request is returned.
    Each request is cached, serialized to JSON and queued as a RequestTask as
    soon as the process pool has a free slot. Finally the pool results are
    processed; the process exits if the pool reports a negative number of
    processed results.

    :return: S_OK(), or the S_ERROR of the cache when a request cannot be
             cached for a reason other than being already cached
    """
    if not self.__rmsMonitoring:
        gMonitor.addMark("Iteration", 1)
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
        self.log.debug("execute: executing %d request in this cycle" % taskCounter)

        requestsToExecute = []

        if not self.__bulkRequest:
            self.log.info("execute: ask for a single request")
            getRequest = self.requestClient().getRequest()
            if not getRequest["OK"]:
                self.log.error("execute:", "%s" % getRequest["Message"])
                break
            if not getRequest["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            requestsToExecute = [getRequest["Value"]]
        else:
            # Never ask for more requests than what is left for this cycle
            numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
            self.log.info("execute: ask for requests", "%s" % numberOfRequest)
            getRequests = self.requestClient().getBulkRequests(numberOfRequest)
            if not getRequests["OK"]:
                self.log.error("execute:", "%s" % getRequests["Message"])
                break
            if not getRequests["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            for rId in getRequests["Value"]["Failed"]:
                self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

            requestsToExecute = list(getRequests["Value"]["Successful"].values())

        self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

        for request in requestsToExecute:
            # # set task id
            taskID = request.RequestID

            self.log.info(
                "processPool status",
                "tasks idle = %s working = %s"
                % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
            )

            # Number of sleep periods spent waiting for a free slot
            looping = 0
            while True:
                if not self.processPool().getFreeSlots():
                    if not looping:
                        self.log.info(
                            "No free slots available in processPool",
                            "will wait %d seconds to proceed" % self.__poolSleep,
                        )
                    time.sleep(self.__poolSleep)
                    looping += 1
                else:
                    if looping:
                        # The product must be parenthesized: '%' and '*' have the same
                        # precedence, so without parentheses the formatted string itself
                        # would be multiplied by the sleep time.
                        self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
                    looping = 0
                    # # save current request in cache
                    res = self.cacheRequest(request)
                    if not res["OK"]:
                        if cmpError(res, errno.EALREADY):
                            # The request is already in the cache, skip it.
                            # break out of the while loop to get next request
                            break
                        # There are too many requests in the cache, commit suicide
                        self.log.error(
                            "Too many requests in cache",
                            "(%d requests): put back all requests and exit cycle. Error %s"
                            % (len(self.__requestCache), res["Message"]),
                        )
                        self.putAllRequests()
                        return res
                    # # serialize to JSON
                    result = request.toJSON()
                    if not result["OK"]:
                        continue
                    requestJSON = result["Value"]
                    self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                    timeOut = self.getTimeout(request)
                    enqueue = self.processPool().createAndQueueTask(
                        RequestTask,
                        kwargs={
                            "requestJSON": requestJSON,
                            "handlersDict": self.handlersDict,
                            "csPath": self.__configPath,
                            "agentName": self.agentName,
                            "rmsMonitoring": self.__rmsMonitoring,
                        },
                        taskID=taskID,
                        blocking=True,
                        usePoolCallbacks=True,
                        timeOut=timeOut,
                    )
                    if not enqueue["OK"]:
                        self.log.error("Could not enqueue task", enqueue["Message"])
                    else:
                        self.log.debug("successfully enqueued task", "'%s'" % taskID)
                        # # update monitor
                        if self.__rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord(
                                {
                                    "timestamp": int(Time.toEpoch()),
                                    "host": Network.getFQDN(),
                                    "objectType": "Request",
                                    "status": "Attempted",
                                    "objectID": request.RequestID,
                                    "nbObject": 1,
                                }
                            )
                        else:
                            gMonitor.addMark("Processed", 1)
                        # # update request counter
                        taskCounter += 1
                        # # task created, a little time kick to proceed
                        time.sleep(0.1)
                        break

    self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
        self.log.fatal("Results queue is screwed up")
        sys.exit(1)
    # # clean return
    return S_OK()
def __generateUniqueClientName(self):
    """
    Build a client name as the MD5 hex digest of the current time string, a
    random number, the local FQDN and the logger name joined with ':'.

    :return: hexadecimal digest string
    """
    hashStr = ":".join((Time.toString(), str(random.random()), Network.getFQDN(), gLogger.getName()))
    # md5 only accepts bytes: a text string has to be encoded first
    # (byte strings are passed through untouched).
    if not isinstance(hashStr, bytes):
        hashStr = hashStr.encode("utf-8")
    return md5(hashStr).hexdigest()
def __findServiceURL(self):
    """
    Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

    Unless KW_IGNORE_GATEWAYS is set to a true value in kwargs, the option
    /DIRAC/Gateways/<siteName> is read and one of its comma separated entries
    (picked through List.randomize) is used as gateway.

    If self._destinationSrv starts with ``<protocol>://`` for one of the
    protocols of gProtocolDict, it is returned as is, or with its server part
    replaced by the gateway.

    Otherwise the URLs are taken from getServiceURL, banned URLs are dropped,
    the remaining ones are randomized and the first one is selected, avoiding
    when possible a host that also serves a banned URL.

    This method also sets some attributes:
      * self.__nbOfUrls = number of URLs
      * self.__nbOfRetry = 2 if we have more than 2 urls, otherwise 3
      * self.__bannedUrls is reinitialized if all the URLs are banned

    :return: S_OK(str)/S_ERROR() -- the selected URL
    """
    if not self.__initStatus['OK']:
        return self.__initStatus

    # Load the gateway URL for the current site name
    gatewayURL = False
    gatewaysIgnored = self.kwargs[self.KW_IGNORE_GATEWAYS] if self.KW_IGNORE_GATEWAYS in self.kwargs else False
    if not gatewaysIgnored:
        dRetVal = gConfig.getOption("/DIRAC/Gateways/%s" % DIRAC.siteName())
        if dRetVal['OK']:
            rawGatewayURL = List.randomize(List.fromChar(dRetVal['Value'], ","))[0]
            # Keep only the "<protocol>://<host>" part
            gatewayURL = "/".join(rawGatewayURL.split("/")[:3])

    # A destination that is already a URL is returned directly (through the gateway if any)
    for protocol in gProtocolDict:
        if self._destinationSrv.startswith("%s://" % protocol):
            gLogger.debug("Already given a valid url", self._destinationSrv)
            if not gatewayURL:
                return S_OK(self._destinationSrv)
            gLogger.debug("Reconstructing given URL to pass through gateway")
            path = "/".join(self._destinationSrv.split("/")[3:])
            finalURL = "%s/%s" % (gatewayURL, path)
            gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
            return S_OK(finalURL)

    if gatewayURL:
        gLogger.debug("Using gateway", gatewayURL)
        return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

    # We extract the list of URLs from the configuration
    try:
        urls = getServiceURL(self._destinationSrv, setup=self.setup)
    except Exception as e:
        return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
    if not urls:
        return S_ERROR("URL for service %s not found" % self._destinationSrv)

    urlsList = List.fromChar(urls, ",")
    self.__nbOfUrls = len(urlsList)
    # we retry 2 times all services, if we run more than 2 services
    self.__nbOfRetry = 2 if self.__nbOfUrls > 2 else 3
    if len(urlsList) == len(self.__bannedUrls):
        self.__bannedUrls = []  # retry all urls
        gLogger.debug("Retrying again all URLs")

    if self.__bannedUrls and len(urlsList) > 1:
        # we have host which is not accessible. We remove that host from the list.
        # We only remove if we have more than one instance
        for bannedURL in self.__bannedUrls:
            # A banned URL may have disappeared from the configuration since it was
            # banned: list.remove would raise ValueError, so check membership first.
            # The last remaining URL is always kept so that a candidate exists.
            if bannedURL in urlsList and len(urlsList) > 1:
                gLogger.debug("Removing banned URL", "%s" % bannedURL)
                urlsList.remove(bannedURL)

    randUrls = List.randomize(urlsList)
    sURL = randUrls[0]

    if self.__bannedUrls and self.__nbOfUrls > 2:
        # when we have multiple services then we can have a situation
        # when two services are running on the same machine with different ports...
        retVal = Network.splitURL(sURL)
        if retVal['OK']:
            # The host comparison only makes sense when the selected URL could be split
            nexturl = retVal['Value']
            found = False
            for bannedURL in self.__bannedUrls:
                retVal = Network.splitURL(bannedURL)
                if not retVal['OK']:
                    break
                if nexturl[1] == retVal['Value'][1]:
                    found = True
                    break
            if found:
                nexturl = self.__selectUrl(nexturl, randUrls[1:])
                if nexturl:  # an url found which is in different host
                    sURL = nexturl

    gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
    return S_OK(sURL)