def do_start(self, args):
        """ Start services or agents or database server

        usage:

          start <system|*> <service|agent|*>
          start mysql
    """
        argss = args.split()
        if len(argss) < 2:
            print self.do_start.__doc__
            return
        option = argss[0]
        del argss[0]

        if option != 'mysql':
            if len(argss) < 1:
                print self.do_start.__doc__
                return
            system = option
            if system != '*':
                component = argss[0]
            else:
                component = '*'
            client = SystemAdministratorClient(self.host, self.port)
            result = client.startComponent(system, component)
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                if system != '*' and component != '*':
                    print "\n%s_%s started successfully, runit status:\n" % (
                        system, component)
                else:
                    print "\nComponents started successfully, runit status:\n"
                for comp in result['Value']:
                    print comp.rjust(
                        32), ':', result['Value'][comp]['RunitStatus']
        else:
            print "Not yet implemented"
Пример #2
0
  def do_start( self, args ):
    """ Start services or agents or database server

        usage:

          start <system|*> <service|agent|*>
          start mysql
    """
    argss = args.split()
    if len( argss ) < 2:
      print self.do_start.__doc__
      return
    option = argss[0]
    del argss[0]

    if option != 'mysql':
      if len( argss ) < 1:
        print self.do_start.__doc__
        return
      system = option
      if system != '*':
        component = argss[0]
      else:
        component = '*'
      client = SystemAdministratorClient( self.host, self.port )
      result = client.startComponent( system, component )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        if system != '*' and component != '*':
          print "\n%s_%s started successfully, runit status:\n" % ( system, component )
        else:
          print "\nComponents started successfully, runit status:\n"
        for comp in result['Value']:
          print comp.rjust( 32 ), ':', result['Value'][comp]['RunitStatus']
    else:
      print "Not yet implemented"
Пример #3
0
  def __componentAction( self , action = None ):

    """
    Actions which should be done on components. The only parameters is an action
    to perform.
    Returns standard JSON response structure with with service response
    or error messages
    """

    DN = getUserDN()
    group = getSelectedGroup()

    if ( not action ) or ( not len( action ) > 0 ):
      error = "Action is not defined or has zero length"
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }

    if action not in [ "restart" , "start" , "stop" , "uninstall" ]:
      error = "The request parameters action '%s' is unknown" % action
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }
    self.action = action

    result = dict()
    for i in request.params:
      if i == "action":
        continue

      target = i.split( " @ " , 1 )
      if not len( target ) == 2:
        continue

      system = request.params[ i ]
      gLogger.always( "System: %s" % system )
      host = target[ 1 ]
      gLogger.always( "Host: %s" % host )
      component = target[ 0 ]
      gLogger.always( "Component: %s" % component )
      if not host in result:
        result[ host ] = list()
      result[ host ].append( [ system , component ] )

    if not len( result ) > 0:
      error = "Failed to get component(s) for %s" % action
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }
      
    gLogger.always( result )
    self.actionSuccess = list()
    self.actionFailed = list()

    for hostname in result.keys():

      if not len( result[ hostname ] ) > 0:
        continue

      client = SystemAdministratorClient( hostname , None , delegatedDN=DN ,
                                          delegatedGroup=group )

      for i in result[ hostname ]:

        system = i[ 0 ]
        component = i[ 1 ]

        try:
          if action == "restart":
            result = client.restartComponent( system , component )
          elif action == "start":
            result = client.startComponent( system , component )
          elif action == "stop":
            result = client.stopComponent( system , component )
          elif action == "uninstall":
            result = client.uninstallComponent( system , component )
          else:
            result = list()
            result[ "Message" ] = "Action %s is not valid" % action
        except Exception, x:
          result = list()
          result[ "Message" ] = "Exception: %s" % str( x )
        gLogger.debug( "Result: %s" % result )

        if not result[ "OK" ]:
          error = hostname + ": " + result[ "Message" ]
          self.actionFailed.append( error )
          gLogger.error( "Failure during component %s: %s" % ( action , error ) )
        else:
          gLogger.always( "Successfully %s component %s" % ( action , component ) )
          self.actionSuccess.append( component )
Пример #4
0
class MonitorAgents(AgentModule):
  """MonitorAgents class."""

  def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    self.errors = list()
    self.accounting = defaultdict(dict)

    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()

  def logError(self, errStr, varMsg=''):
    """Append errors to a list, which is sent in email notification."""
    self.log.error(errStr, varMsg)
    self.errors.append(errStr + " " + varMsg)

  def beginExecution(self):
    """Reload the configurations before every cycle."""
    self.setup = self.am_getOption("Setup", self.setup)
    self.enabled = self.am_getOption("EnableFlag", self.enabled)
    self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
    self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
    self.restartServices = self.am_getOption("RestartServices", self.restartServices)
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)
    self.addressTo = self.am_getOption('MailTo', self.addressTo)
    self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
    self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents)
    self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs)

    self.csAPI = CSAPI()

    res = self.getRunningInstances(instanceType='Agents')
    if not res["OK"]:
      return S_ERROR("Failure to get running agents")
    self.agents = res["Value"]

    res = self.getRunningInstances(instanceType='Executors')
    if not res["OK"]:
      return S_ERROR("Failure to get running executors")
    self.executors = res["Value"]

    res = self.getRunningInstances(instanceType='Services')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle."""
    if not(self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    rows = []
    for instanceName, val in self.accounting.iteritems():
      rows.append([[instanceName],
                   [val.get('Treatment', 'No Treatment')],
                   [str(val.get('LogAge', 'Not Relevant'))]])

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      emailBody += "\n\nErrors:"
      emailBody += "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        self.log.error("Failure to send Email notification to ", address)
        continue

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of running agents, executors or services.

    Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: Dictionary of running instances
    """
    res = self.sysAdminClient.getOverallStatus()
    if not res["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
      return res

    val = res['Value'][instanceType]
    runningAgents = defaultdict(dict)
    for system, agents in val.iteritems():
      for agentName, agentInfo in agents.iteritems():
        if agentInfo['Setup'] and agentInfo['Installed']:
          if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus:
            continue
          confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName)
          for option, default in (('PollingTime', HOUR), ('Port', None)):
            optPath = os.path.join(confPath, option)
            runningAgents[agentName][option] = gConfig.getValue(optPath, default)
          runningAgents[agentName]["LogFileLocation"] = \
              os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current')
          runningAgents[agentName]["PID"] = agentInfo["PID"]
          runningAgents[agentName]['Module'] = agentInfo['Module']
          runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus']
          runningAgents[agentName]['System'] = system

    return S_OK(runningAgents)

  def on_terminate(self, agentName, process):
    """Execute callback when a process terminates gracefully."""
    self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid))

  def execute(self):
    """Execute checks for agents, executors, services."""
    for instanceType in ('executor', 'agent', 'service'):
      for name, options in getattr(self, instanceType + 's').iteritems():
        # call checkAgent, checkExecutor, checkService
        res = getattr(self, 'check' + instanceType.capitalize())(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      if "Stopped does not exist" not in res['Message'] and \
         "Running does not exist" not in res['Message']:
        self.logError("Failure to control components", res['Message'])

    if not self.errors:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])
    else:
      self.logError('Something was wrong before, not checking URLs this time')

    self.sendNotification()

    if self.errors:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the age of log file."""
    lastAccessTime = 0
    try:
      lastAccessTime = os.path.getmtime(logFileLocation)
      lastAccessTime = datetime.fromtimestamp(lastAccessTime)
    except OSError as e:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e))

    now = datetime.now()
    age = now - lastAccessTime
    return S_OK(age)

  def restartInstance(self, pid, instanceName, enabled):
    """Kill a process which is then restarted automatically."""
    if not (self.enabled and enabled):
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      agentProc = psutil.Process(int(pid))
      processesToTerminate = agentProc.children(recursive=True)
      processesToTerminate.append(agentProc)

      for proc in processesToTerminate:
        proc.terminate()

      _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5,
                                       callback=partial(self.on_terminate, instanceName))
      for proc in alive:
        self.log.info("Forcefully killing process %s" % proc.pid)
        proc.kill()

      return S_OK()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond."""
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if not pingRes['OK']:
      self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
      res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
      if not res["OK"]:
        return res
      elif res['OK'] and res['Value'] != NO_RESTART:
        self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
        self.log.info("Agent %s has been successfully restarted" % serviceName)
    self.log.info("Service responded OK")
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent."""
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES)))

    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if age.seconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = age.seconds / MINUTES

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES)))

    if age.seconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = age.seconds / MINUTES

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['OK'] and res['Value'] == NO_CHECKING_JOBS:
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Check if there are checking jobs with the **executorName** as current MinorStatus."""
    attrDict = {'Status': 'Checking', 'MinorStatus': executorName}

    # returns list of jobs IDs
    resJobs = self.jobMonClient.getJobs(attrDict)
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs
    if resJobs['Value']:
      self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName))
      return S_OK(CHECKING_JOBS)
    self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
    return S_OK(NO_CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Check for running and stopped components and ensure they have the proper status as defined in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    # get the current status of the components

    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent
    currentStatus = resCurrent['Value']

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault
    defaultStatus = resDefault['Value']

    # ensure instances are in the right state
    shouldBe = {}
    shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down'])
    shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run'])
    shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All'])

    self._ensureComponentRunning(shouldBe['Run'])
    self._ensureComponentDown(shouldBe['Down'])

    for instance in shouldBe['Unknown']:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Get current status for components."""
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall
    currentStatus = {'Down': set(), 'Run': set(), 'All': set()}
    informationDict = resOverall['Value']
    for systemsDict in informationDict.values():
      for system, instancesDict in systemsDict.items():
        for instanceName, instanceInfoDict in instancesDict.items():
          identifier = '%s__%s' % (system, instanceName)
          runitStatus = instanceInfoDict.get('RunitStatus')
          if runitStatus in ('Run', 'Down'):
            currentStatus[runitStatus].add(identifier)

    currentStatus['All'] = currentStatus['Run'] | currentStatus['Down']
    return S_OK(currentStatus)

  def _getDefaultComponentStatus(self):
    """Get the configured status of the components."""
    host = socket.gethostname()
    defaultStatus = {'Down': set(), 'Run': set(), 'All': set()}
    resRunning = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running'))
    resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped'))
    if not resRunning['OK']:
      return resRunning
    if not resStopped['OK']:
      return resStopped
    defaultStatus['Run'] = set(resRunning['Value'].keys())
    defaultStatus['Down'] = set(resStopped['Value'].keys())
    defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down']

    if defaultStatus['Run'].intersection(defaultStatus['Down']):
      self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down'])))
      return S_ERROR("Bad host configuration")

    return S_OK(defaultStatus)

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running."""
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running."""
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Ensure that the running services have their URL in the Config."""
    self.log.info("Checking URLs")
    # get services again, in case they were started/stop in controlComponents
    gConfig.forceRefresh(fromMaster=True)
    res = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]
    for service, options in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (service, options))
      # ignore SystemAdministrator, does not have URLs
      if 'SystemAdministrator' in service:
        continue
      self._checkServiceURL(service, options)

    if self.csAPI.csModified and self.commitURLs:
      self.log.info("Commiting changes to the CS")
      result = self.csAPI.commit()
      if not result['OK']:
        self.logError('Commit to CS failed', result['Message'])
        return S_ERROR("Failed to commit to CS")
    return S_OK()

  def _checkServiceURL(self, serviceName, options):
    """Ensure service URL is properly configured in the CS."""
    url = self._getURL(serviceName, options)
    system = options['System']
    module = options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))
    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)
    runitStatus = options['RunitStatus']
    wouldHave = 'Would have ' if not self.commitURLs else ''
    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
    if runitStatus == 'Down' and url in urls:
      urls.remove(url)
      message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service."""
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
Пример #5
0
class ComponentSupervisionAgent(AgentModule):
    """ComponentSupervisionAgent class."""
    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        self._tornadoPort = "8443"
        self.errors = list()
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn(
        )

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification."""
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the configurations before every cycle."""
        self.setup = self.am_getOption("Setup", self.setup)
        self.enabled = self.am_getOption("EnableFlag", self.enabled)
        self.restartAgents = self.am_getOption("RestartAgents",
                                               self.restartAgents)
        self.restartExecutors = self.am_getOption("RestartExecutors",
                                                  self.restartExecutors)
        self.restartServices = self.am_getOption("RestartServices",
                                                 self.restartServices)
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)
        self.controlComponents = self.am_getOption("ControlComponents",
                                                   self.controlComponents)
        self.commitURLs = self.am_getOption("CommitURLs", self.commitURLs)
        self.doNotRestartInstancePattern = self.am_getOption(
            "DoNotRestartInstancePattern", self.doNotRestartInstancePattern)

        self.csAPI = CSAPI()

        res = self.getRunningInstances(instanceType="Agents")
        if not res["OK"]:
            return S_ERROR("Failure to get running agents")
        self.agents = res["Value"]

        res = self.getRunningInstances(instanceType="Executors")
        if not res["OK"]:
            return S_ERROR("Failure to get running executors")
        self.executors = res["Value"]

        res = self.getRunningInstances(instanceType="Services")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle."""
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = []
        for instanceName, val in self.accounting.items():
            rows.append([[instanceName],
                         [val.get("Treatment", "No Treatment")],
                         [str(val.get("LogAge", "Not Relevant"))]])

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns,
                                    rows,
                                    printOut=False,
                                    numbering=False,
                                    columnSeparator=" | ")

        if self.errors:
            emailBody += "\n\nErrors:"
            emailBody += "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address,
                                        self.emailSubject,
                                        emailBody,
                                        self.addressFrom,
                                        localAttempt=False)
            if not res["OK"]:
                self.log.error("Failure to send Email notification to ",
                               address)
                continue

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of running agents, executors or services.

        Key is component's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: Dictionary of running instances
        """
        res = self.sysAdminClient.getOverallStatus()
        if not res["OK"]:
            self.logError(
                "Failure to get %s from system administrator client" %
                instanceType, res["Message"])
            return res

        val = res["Value"][instanceType]
        runningComponents = defaultdict(dict)
        for system, components in val.items():
            for componentName, componentInfo in components.items():
                if componentInfo["Setup"] and componentInfo["Installed"]:
                    if runitStatus != "All" and componentInfo[
                            "RunitStatus"] != runitStatus:
                        continue
                    for option, default in (("PollingTime", HOUR),
                                            ("Port", None), ("Protocol",
                                                             None)):
                        runningComponents[componentName][
                            option] = self._getComponentOption(
                                instanceType, system, componentName, option,
                                default)
                        # remove empty values so we can use defaults in _getURL
                        if not runningComponents[componentName][option]:
                            runningComponents[componentName].pop(option)
                    runningComponents[componentName][
                        "LogFileLocation"] = os.path.join(
                            self.diracLocation, "runit", system, componentName,
                            "log", "current")
                    runningComponents[componentName]["PID"] = componentInfo[
                        "PID"]
                    runningComponents[componentName]["Module"] = componentInfo[
                        "Module"]
                    runningComponents[componentName][
                        "RunitStatus"] = componentInfo["RunitStatus"]
                    runningComponents[componentName]["System"] = system

        return S_OK(runningComponents)

    def _getComponentOption(self, instanceType, system, componentName, option,
                            default):
        """Get component option from DIRAC CS, using components' base classes methods."""
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )
        if instanceType != "Agents":
            return gConfig.getValue(Path.cfgPath(componentPath, option),
                                    default)
        # deal with agent configuration
        componentLoadModule = gConfig.getValue(
            Path.cfgPath(componentPath, "Module"), componentName)
        fullComponentName = Path.cfgPath(system, componentName)
        fullComponentLoadName = Path.cfgPath(system, componentLoadModule)
        return AgentModule(fullComponentName,
                           fullComponentLoadName).am_getOption(
                               option, default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully."""
        self.log.info(
            "%s's process with ID: %s has been terminated successfully" %
            (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services."""
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                res = getattr(self,
                              "check" + instanceType.capitalize())(name,
                                                                   options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType,
                                  "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            if "Stopped does not exist" not in res[
                    "Message"] and "Running does not exist" not in res[
                        "Message"]:
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError(
                "Something was wrong before, not checking URLs this time")

        self.sendNotification()

        if self.errors:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of log file."""
        lastAccessTime = 0
        try:
            lastAccessTime = os.path.getmtime(logFileLocation)
            lastAccessTime = datetime.fromtimestamp(lastAccessTime)
        except OSError as e:
            return S_ERROR("Failed to access logfile %s: %r" %
                           (logFileLocation, e))

        now = datetime.now()
        age = now - lastAccessTime
        return S_OK(age)

    def restartInstance(self, pid, instanceName, enabled):
        """Kill a process which is then restarted automatically."""
        if not (self.enabled and enabled):
            self.log.info(
                "Restarting is disabled, please restart %s manually" %
                instanceName)
            self.accounting[instanceName][
                "Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        if any(pattern in instanceName
               for pattern in self.doNotRestartInstancePattern):
            self.log.info(
                "Restarting for %s is disabled, please restart it manually" %
                instanceName)
            self.accounting[instanceName][
                "Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            componentProc = psutil.Process(int(pid))
            processesToTerminate = componentProc.children(recursive=True)
            processesToTerminate.append(componentProc)

            for proc in processesToTerminate:
                proc.terminate()

            _gone, alive = psutil.wait_procs(processesToTerminate,
                                             timeout=5,
                                             callback=partial(
                                                 self.on_terminate,
                                                 instanceName))
            for proc in alive:
                self.log.info("Forcefully killing process %s" % proc.pid)
                proc.kill()

            return S_OK()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes",
                          "%s" % err)
            return S_ERROR()

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond."""
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if not pingRes["OK"]:
            self.log.info("Failure pinging service: %s: %s" %
                          (url, pingRes["Message"]))
            res = self.restartInstance(int(options["PID"]), serviceName,
                                       self.restartServices)
            if not res["OK"]:
                return res
            if res["Value"] != NO_RESTART:
                self.accounting[serviceName][
                    "Treatment"] = "Successfully Restarted"
                self.log.info("Service %s has been successfully restarted" %
                              serviceName)
        self.log.info("Service responded OK")
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of agent's log file, if it is too old then restart the agent."""
        pollingTime, currentLogLocation, pid = (options["PollingTime"],
                                                options["LogFileLocation"],
                                                options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        age = res["Value"]
        self.log.info("Current log file for %s is %d minutes old" %
                      (agentName, (age.seconds / MINUTES)))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if age.seconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = age.seconds / MINUTES

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" %
                          agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        age = res["Value"]
        self.log.info("Current log file for %s is %d minutes old" %
                      (executor, (age.seconds / MINUTES)))

        if age.seconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = age.seconds / MINUTES

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["OK"] and res["Value"] == NO_CHECKING_JOBS:
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        elif res["OK"] and res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" %
                          executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Check if there are checking jobs with the **executorName** as current MinorStatus."""
        attrDict = {"Status": "Checking", "MinorStatus": executorName}

        # returns list of jobs IDs
        resJobs = self.jobMonClient.getJobs(attrDict)
        if not resJobs["OK"]:
            self.logError("Could not get jobs for this executor",
                          "%s: %s" % (executorName, resJobs["Message"]))
            return resJobs
        if resJobs["Value"]:
            self.log.info('Found %d jobs in "Checking" status for %s' %
                          (len(resJobs["Value"]), executorName))
            return S_OK(CHECKING_JOBS)
        self.log.info('Found no jobs in "Checking" status for %s' %
                      executorName)
        return S_OK(NO_CHECKING_JOBS)

    def componentControl(self):
        """Monitor and control component status as defined in the CS.

        Check for running and stopped components and ensure they have the proper status as defined in the CS
        Registry/Hosts/_HOST_/[Running|Stopped] sections

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # get the current status of the components

        resCurrent = self._getCurrentComponentStatus()
        if not resCurrent["OK"]:
            return resCurrent
        currentStatus = resCurrent["Value"]

        resDefault = self._getDefaultComponentStatus()
        if not resDefault["OK"]:
            return resDefault
        defaultStatus = resDefault["Value"]

        # ensure instances are in the right state
        shouldBe = {}
        shouldBe["Run"] = defaultStatus["Run"].intersection(
            currentStatus["Down"])
        shouldBe["Down"] = defaultStatus["Down"].intersection(
            currentStatus["Run"])
        shouldBe["Unknown"] = defaultStatus["All"].symmetric_difference(
            currentStatus["All"])

        self._ensureComponentRunning(shouldBe["Run"])
        self._ensureComponentDown(shouldBe["Down"])

        for instance in shouldBe["Unknown"]:
            self.logError("Unknown instance",
                          "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Get current status for components."""
        resOverall = self.sysAdminClient.getOverallStatus()
        if not resOverall["OK"]:
            return resOverall
        currentStatus = {"Down": set(), "Run": set(), "All": set()}
        informationDict = resOverall["Value"]
        for systemsDict in informationDict.values():
            for system, instancesDict in systemsDict.items():
                for instanceName, instanceInfoDict in instancesDict.items():
                    identifier = "%s__%s" % (system, instanceName)
                    runitStatus = instanceInfoDict.get("RunitStatus")
                    if runitStatus in ("Run", "Down"):
                        currentStatus[runitStatus].add(identifier)

        currentStatus["All"] = currentStatus["Run"] | currentStatus["Down"]
        return S_OK(currentStatus)

    def _getDefaultComponentStatus(self):
        """Get the configured status of the components."""
        host = socket.getfqdn()
        defaultStatus = {"Down": set(), "Run": set(), "All": set()}
        resRunning = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", host, "Running"))
        resStopped = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", host, "Stopped"))
        if not resRunning["OK"]:
            return resRunning
        if not resStopped["OK"]:
            return resStopped
        defaultStatus["Run"] = set(resRunning["Value"])
        defaultStatus["Down"] = set(resStopped["Value"])
        defaultStatus["All"] = defaultStatus["Run"] | defaultStatus["Down"]

        if defaultStatus["Run"].intersection(defaultStatus["Down"]):
            self.logError(
                "Overlap in configuration",
                str(defaultStatus["Run"].intersection(defaultStatus["Down"])))
            return S_ERROR("Bad host configuration")

        return S_OK(defaultStatus)

    def _ensureComponentRunning(self, shouldBeRunning):
        """Ensure the correct components are running."""
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.startComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to start component:",
                                  "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance][
                        "Treatment"] = "Instance was down, started instance"
            else:
                self.accounting[instance][
                    "Treatment"] = "Instance is down, should be started"

    def _ensureComponentDown(self, shouldBeDown):
        """Ensure the correct components are not running."""
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.stopComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to stop component:",
                                  "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance][
                        "Treatment"] = "Instance was running, stopped instance"
            else:
                self.accounting[instance][
                    "Treatment"] = "Instance is running, should be stopped"

    def checkURLs(self):
        """Ensure that the running services have their URL in the Config."""
        self.log.info("Checking URLs")
        # get services again, in case they were started/stop in controlComponents
        gConfig.forceRefresh(fromMaster=True)

        # get port used for https based services
        try:
            tornadoSystemInstance = PathFinder.getSystemInstance(
                system="Tornado",
                setup=self.setup,
            )
            self._tornadoPort = gConfig.getValue(
                Path.cfgPath("/System/Tornado/", tornadoSystemInstance,
                             "Port"),
                self._tornadoPort,
            )
        except RuntimeError:
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        res = self.getRunningInstances(instanceType="Services",
                                       runitStatus="All")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]
        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" %
                           (service, options))
            # ignore SystemAdministrator, does not have URLs
            if "SystemAdministrator" in service:
                continue
            self._checkServiceURL(service, options)

        if self.csAPI.csModified and self.commitURLs:
            self.log.info("Commiting changes to the CS")
            result = self.csAPI.commit()
            if not result["OK"]:
                self.logError("Commit to CS failed", result["Message"])
                return S_ERROR("Failed to commit to CS")
        return S_OK()

    def _checkServiceURL(self, serviceName, options):
        """Ensure service URL is properly configured in the CS."""
        url = self._getURL(serviceName, options)
        system = options["System"]
        module = options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))
        urlsConfigPath = Path.cfgPath(
            PathFinder.getSystemURLSection(system=system, setup=self.setup),
            module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)
        runitStatus = options["RunitStatus"]
        wouldHave = "Would have " if not self.commitURLs else ""
        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url,
                                                            system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
        if runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url,
                                                                system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Return URL for the service."""
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        host = socket.getfqdn()
        protocol = options.get("Protocol", "dips")
        url = "%s://%s:%s/%s/%s" % (protocol, host, port, system, serviceName)
        return url