Пример #1
0
  def submit( self ):
    """
    Return the flattened overall component status of a single host.

    The host name is read from the 'Hostname' entry of self.request(). The
    host is queried through a SystemAdministratorClient created with the
    values returned by getUserDN() and getSelectedGroup() as delegated
    DN and group. Every record yielded by self.flatten() gets the host name
    stored under its "Host" key.

    :return: dict with "success" set to "true" and the records under
             "result", or "success" set to "false" and an "error" message
    """

    checkUserCredentials()
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    params = self.request()
    if 'Hostname' not in params:
      return { "success" : "false" , "error" : "Name of the host is absent" }

    host = params[ 'Hostname' ]
    admin = SystemAdministratorClient( host , None , delegatedDN = userDN ,
                                       delegatedGroup = userGroup )
    status = admin.getOverallStatus()
    gLogger.debug( "Result of getOverallStatus(): %s" % status )

    if not status[ "OK" ]:
      return { "success" : "false" , "error" : status[ "Message" ] }

    records = []
    for entry in self.flatten( status[ "Value" ] ):
      # tag each flattened record with the host it was collected from
      entry[ "Host" ] = host
      records.append( entry )

    return { "success" : "true" , "result" : records }
    def do_show(self, args):
        """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """

        argss = args.split()
        if not argss:
            print self.do_show.__doc__
            return

        option = argss[0]
        del argss[0]

        if option == 'software':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getSoftwareComponents()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print
                pprint.pprint(result['Value'])
        elif option == 'installed':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getInstalledComponents()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print
                pprint.pprint(result['Value'])
        elif option == 'setup':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getSetupComponents()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print
                pprint.pprint(result['Value'])
        elif option == 'project':
            result = SystemAdministratorClient(self.host,
                                               self.port).getProject()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print "Current project is %s" % result['Value']
        elif option == 'status':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getOverallStatus()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                fields = [
                    "System", 'Name', 'Module', 'Type', 'Setup', 'Installed',
                    'Runit', 'Uptime', 'PID'
                ]
                records = []
                rDict = result['Value']
                for compType in rDict:
                    for system in rDict[compType]:
                        components = rDict[compType][system].keys()
                        components.sort()
                        for component in components:
                            record = []
                            if rDict[compType][system][component]['Installed']:
                                module = str(rDict[compType][system][component]
                                             ['Module'])
                                record += [
                                    system, component, module,
                                    compType.lower()[:-1]
                                ]
                                if rDict[compType][system][component]['Setup']:
                                    record += ['Setup']
                                else:
                                    record += ['NotSetup']
                                if rDict[compType][system][component][
                                        'Installed']:
                                    record += ['Installed']
                                else:
                                    record += ['NotInstalled']
                                record += [
                                    str(rDict[compType][system][component]
                                        ['RunitStatus'])
                                ]
                                record += [
                                    str(rDict[compType][system][component]
                                        ['Timeup'])
                                ]
                                record += [
                                    str(rDict[compType][system][component]
                                        ['PID'])
                                ]
                                records.append(record)
                printTable(fields, records)
        elif option == 'database' or option == 'databases':
            client = SystemAdministratorClient(self.host, self.port)
            if not InstallTools.mysqlPassword:
                InstallTools.mysqlPassword = "******"
            InstallTools.getMySQLPasswords()
            result = client.getDatabases(InstallTools.mysqlRootPwd)
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            resultSW = client.getAvailableDatabases()
            if not resultSW['OK']:
                self.__errMsg(resultSW['Message'])
                return

            sw = resultSW['Value']
            installed = result['Value']
            print
            for db in sw:
                if db in installed:
                    print db.rjust(25), ': Installed'
                else:
                    print db.rjust(25), ': Not installed'
            if not sw:
                print "No database found"
        elif option == 'mysql':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getMySQLStatus()
            if not result['OK']:
                self.__errMsg(result['Message'])
            elif result['Value']:
                print
                for par, value in result['Value'].items():
                    print par.rjust(28), ':', value
            else:
                print "No MySQL database found"
        elif option == "log":
            self.getLog(argss)
        elif option == "info":
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getInfo()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print
                print "Setup:", result['Value']['Setup']
                print "DIRAC version:", result['Value']['DIRAC']
                if result['Value']['Extensions']:
                    for e, v in result['Value']['Extensions'].items():
                        print "%s version" % e, v
                print
        elif option == "host":
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getHostInfo()
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print
                print "Host info:"
                print

                fields = ['Parameter', 'Value']
                records = []
                for key, value in result['Value'].items():
                    records.append([key, str(value)])

                printTable(fields, records)

        elif option == "errors":
            self.getErrors(argss)
        else:
            print "Unknown option:", option
Пример #3
0
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """

    argss = args.split()
    if not argss:
      print self.do_show.__doc__
      return

    option = argss[0]
    del argss[0]

    if option == 'software':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSoftwareComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'installed':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInstalledComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'setup':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSetupComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print "Current project is %s" % result[ 'Value' ]
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        fields = ["System",'Name','Module','Type','Setup','Installed','Runit','Uptime','PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            components = rDict[compType][system].keys()
            components.sort()
            for component in components:
              record = []
              if rDict[compType][system][component]['Installed']:
                module = str( rDict[compType][system][component]['Module'] )
                record += [ system,component,module,compType.lower()[:-1]]
                if rDict[compType][system][component]['Setup']:
                  record += ['Setup']
                else:
                  record += ['NotSetup']
                if rDict[compType][system][component]['Installed']:
                  record += ['Installed']
                else:
                  record += ['NotInstalled']
                record += [str( rDict[compType][system][component]['RunitStatus'] )]
                record += [str( rDict[compType][system][component]['Timeup'] )]
                record += [str( rDict[compType][system][component]['PID'] )]
                records.append(record)  
        printTable(fields,records)        
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self.__errMsg( resultSW['Message'] )
        return

      sw = resultSW['Value']
      installed = result['Value']
      print
      for db in sw:
        if db in installed:
          print db.rjust( 25 ), ': Installed'
        else:
          print db.rjust( 25 ), ': Not installed'
      if not sw:
        print "No database found"
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      elif result['Value']:
        print
        for par, value in result['Value'].items():
          print par.rjust( 28 ), ':', value
      else:
        print "No MySQL database found"
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print
        print "Setup:", result['Value']['Setup']
        print "DIRAC version:", result['Value']['DIRAC']
        if result['Value']['Extensions']:
          for e, v in result['Value']['Extensions'].items():
            print "%s version" % e, v
        print
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:   
        print   
        print "Host info:"
        print
        
        fields = ['Parameter','Value']
        records = []
        for key,value in result['Value'].items():
          records.append( [key, str(value) ] )
          
        printTable( fields, records )  
            
    elif option == "errors":
      self.getErrors( argss )
    else:
      print "Unknown option:", option
Пример #4
0
class LemonAgent( AgentModule ):
  """
  Agent writing one "LEMON <criticality> <status> <component>: <text>" log
  line per set up service or agent of the local host.

  The component states are obtained from a SystemAdministratorClient
  connected to 'localhost'. A cycle reports nothing but a single notice when
  the current setup is not listed in /Operations/lhcb/Lemon/MonitoredSetups
  or when the local host name is listed in
  /Operations/lhcb/Lemon/HostsInMaintenance.
  """

  def initialize( self ):
    """ Define the log line constants, read the setup and create the client.

        :return: S_OK()
    """
    self.NON_CRITICAL = "NonCritical"
    self.CRITICAL = "Critical"
    self.FAILURE = "FAILURE"
    self.OK = "OK"

    self.setup = gConfig.getValue( '/DIRAC/Setup', 'LHCb-Development' )
    # when False, components whose criticality is NON_CRITICAL are not logged
    self.outputNonCritical = True

    self.admClient = SystemAdministratorClient( 'localhost' )

    return S_OK()

  def execute( self ):
    """ Main execution method

        Always returns S_OK(); problems are reported through the log lines.
    """

    monitoredSetups = gConfig.getValue( '/Operations/lhcb/Lemon/MonitoredSetups', ['LHCb-Production'] )
    self.monitoringEnabled = self.setup in monitoredSetups

    if not self.monitoringEnabled:
      self._log( "Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                 "Monitoring not enabled for this setup: " + self.setup + ". Exiting." )
      return S_OK()

    hostsInMaintenance = gConfig.getValue( '/Operations/lhcb/Lemon/HostsInMaintenance', [] )
    if gethostname() in hostsInMaintenance:
      self._log( "Framework/LemonAgent", self.NON_CRITICAL, self.OK, "I am in maintenance mode, exiting." )
      return S_OK()

    result = self.admClient.getOverallStatus()

    if not result or not result['OK']:
      self._log( "Framework/LemonAgent", self.CRITICAL, self.FAILURE, "Can not obtain result!!" )
      return S_OK()

    status = result[ 'Value' ]
    self._processResults( status[ 'Services' ] )
    self._processResults( status[ 'Agents' ] )

    return S_OK()

  def _processResults( self, results ):
    """ Log the state of every set up component found in results.

        :param results: mapping system -> component name -> component
                        description; the 'Setup' and 'RunitStatus' entries
                        of each description are read
    """
    for system in results:
      for part in results[system]:
        component = results[system][part]
        componentName = system + "/" + part
        # only set up services and agents are monitored; components that are
        # merely installed are deliberately ignored
        if not component['Setup']:
          continue
        # look the criticality up once and reuse it for the log line
        critLevel = self._getCriticality( componentName )
        if critLevel == self.NON_CRITICAL and not self.outputNonCritical:
          continue
        if component['RunitStatus'] == 'Run':
          self._log( componentName, critLevel, self.OK, "Service/Agent running fine" )
        else:
          self._log( componentName, critLevel, self.FAILURE, "Service/Agent failure!" )

  def _getCriticality( self, component ):
    """ Return the configured criticality of a component.

        The setup specific value under /Operations/lhcb/<setup>/Lemon wins
        over the common one under /Operations/lhcb/Lemon; components present
        in neither place are treated as NON_CRITICAL.

        :param component: component name in the "<system>/<name>" form
    """
    common = gConfig.getValue( '/Operations/lhcb/Lemon/Criticalities/' + component, self.NON_CRITICAL )
    return gConfig.getValue( '/Operations/lhcb/' + self.setup + '/Lemon/Criticalities/' + component, common )

  def _log( self, component, criticality, status, string ):
    """ Write one LEMON line at info level. """
    gLogger.info( "LEMON %s %s %s: %s\n" % ( criticality, status, component, string ) )
Пример #5
0
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show doc <type> <system> <name>
                             - show documentation for a given service or agent
          show host          - show host related parameters
          show hosts         - show all available hosts
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is sent verbatim to gLogger.notice as the
    # usage message, so developer notes are kept in comments only.
    #
    # args is the raw argument string of the command. Its first word selects
    # the report; the remaining words are passed on to getLog(), getErrors(),
    # getInstallations() or used as <type> <system> <name> by "doc".
    # The method returns nothing: reports go to gLogger.notice, pprint and
    # printTable.

    argss = args.split()
    if not argss:
      gLogger.notice( self.do_show.__doc__ )
      return

    option, argss = argss[0], argss[1:]

    # software / installed / setup only differ in the client method called
    listingMethods = { 'software' : 'getSoftwareComponents',
                       'installed' : 'getInstalledComponents',
                       'setup' : 'getSetupComponents' }

    if option in listingMethods:
      client = SystemAdministratorClient( self.host, self.port )
      result = getattr( client, listingMethods[option] )()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( "Current project is %s" % result[ 'Value' ] )
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        fields = ["System",'Name','Module','Type','Setup','Installed','Runit','Uptime','PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            systemDict = rDict[compType][system]
            # sorted() instead of keys() + sort(): same order, and it does
            # not rely on keys() returning a list
            for component in sorted( systemDict ):
              info = systemDict[component]
              # components that are not installed are left out of the table,
              # therefore the 'Installed' column always reads 'Installed'
              if not info['Installed']:
                continue
              records.append( [ system,
                                component,
                                str( info['Module'] ),
                                compType.lower()[:-1],
                                'Setup' if info['Setup'] else 'NotSetup',
                                'Installed',
                                str( info['RunitStatus'] ),
                                str( info['Timeup'] ),
                                str( info['PID'] ) ] )
        printTable( fields, records )
    elif option in ( 'database', 'databases' ):
      client = SystemAdministratorClient( self.host, self.port )
      # NOTE(review): the fallback literal looks redacted - confirm the
      # intended default password.
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self.__errMsg( resultSW['Message'] )
        return

      available = resultSW['Value']
      installed = result['Value']
      gLogger.notice( '' )
      for db in available:
        if db in installed:
          state = ': Installed'
        else:
          state = ': Not installed'
        gLogger.notice( db.rjust( 25 ), state )
      if not available:
        gLogger.notice( "No database found" )
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      elif result['Value']:
        gLogger.notice( '' )
        for par, value in result['Value'].items():
          # message + variable part, like the other two-argument notice
          # calls of this method; passing a single tuple would log the
          # tuple itself rather than "<par> : <value>"
          gLogger.notice( par.rjust( 28 ), ': %s' % ( value, ) )
      else:
        gLogger.notice( "No MySQL database found" )
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        info = result['Value']
        gLogger.notice( '' )
        gLogger.notice( "Setup:", info['Setup'] )
        gLogger.notice( "DIRAC version:", info['DIRAC'] )
        if info['Extensions']:
          for e, v in info['Extensions'].items():
            gLogger.notice( "%s version" % e, v )
        gLogger.notice( '' )
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Host info:" )
        gLogger.notice( '' )

        fields = ['Parameter','Value']
        records = [ [ key, str( value ) ] for key, value in result['Value'].items() ]

        printTable( fields, records )
    elif option == "hosts":
      client = ComponentMonitoringClient()
      result = client.getHosts( {}, False, False )
      if not result[ 'OK' ]:
        self.__errMsg( 'Error retrieving the list of hosts: %s' % ( result[ 'Message' ] ) )
      else:
        hostList = result[ 'Value' ]
        separator = '-' * 69
        gLogger.notice( '' )
        gLogger.notice( ' ' + 'Host'.center( 32 ) + ' ' + 'CPU'.center( 34 ) + ' ' )
        gLogger.notice( separator )
        for element in hostList:
          gLogger.notice( '|' + element[ 'HostName' ].center( 32 ) + '|' + element[ 'CPU' ].center( 34 ) + '|' )
        gLogger.notice( separator )
        gLogger.notice( '' )
    elif option == "errors":
      self.getErrors( argss )
    elif option == "installations":
      self.getInstallations( argss )
    elif option == "doc":
      # needs <type> <system> <name> with <type> being service or agent;
      # anything else shows the usage message
      if len( argss ) > 2 and argss[0] in [ 'service', 'agent' ]:
        compType, compSystem, compModule = argss[:3]
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getComponentDocumentation( compType, compSystem, compModule )
        if result[ 'OK' ]:
          gLogger.notice( result[ 'Value' ] )
        else:
          self.__errMsg( result[ 'Message' ] )
      else:
        gLogger.notice( self.do_show.__doc__ )
    else:
      gLogger.notice( "Unknown option:", option )
  def do_show( self, args ):
    """ 
        Show list of components with various related information
        
        usage:
    
          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show errors [*|<system> <service|agent>] 
                             - show error count for the given component or all the components
                               in the last hour and day
    """

    argss = args.split()
    if not argss:
      print self.do_show.__doc__
      return

    option = argss[0]
    del argss[0]
    if option == 'software':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSoftwareComponents()
      if not result['OK']:
        print " ERROR:", result['Message']
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'installed':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInstalledComponents()
      if not result['OK']:
        print " ERROR:", result['Message']
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'setup':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSetupComponents()
      if not result['OK']:
        print " ERROR:", result['Message']
      else:
        print
        pprint.pprint( result['Value'] )
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        print "ERROR:", result['Message']
      else:
        rDict = result['Value']
        print
        print "   System", ' '*20, 'Name', ' '*15, 'Type', ' '*13, 'Setup    Installed   Runit    Uptime    PID'
        print '-' * 116
        for compType in rDict:
          for system in rDict[compType]:
            for component in rDict[compType][system]:
              if rDict[compType][system][component]['Installed']:
                print  system.ljust( 28 ), component.ljust( 28 ), compType.lower()[:-1].ljust( 7 ),
                if rDict[compType][system][component]['Setup']:
                  print 'SetUp'.rjust( 12 ),
                else:
                  print 'NotSetup'.rjust( 12 ),
                if rDict[compType][system][component]['Installed']:
                  print 'Installed'.rjust( 12 ),
                else:
                  print 'NotInstalled'.rjust( 12 ),
                print str( rDict[compType][system][component]['RunitStatus'] ).ljust( 7 ),
                print str( rDict[compType][system][component]['Timeup'] ).rjust( 7 ),
                print str( rDict[compType][system][component]['PID'] ).rjust( 8 ),
                print
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        print "ERROR:", result['Message']
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        print "ERROR:", resultSW['Message']
        return

      sw = resultSW['Value']
      installed = result['Value']
      print
      for db in sw:
        if db in installed:
          print db.rjust( 25 ), ': Installed'
        else:
          print db.rjust( 25 ), ': Not installed'
      if not sw:
        print "No database found"
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        print "ERROR:", result['Message']
      elif result['Value']:
        print
        for par, value in result['Value'].items():
          print par.rjust( 28 ), ':', value
      else:
        print "No MySQL database found"
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        print "ERROR:", result['Message']
      else:
        print
        print "Setup:", result['Value']['Setup']
        print "DIRAC version:", result['Value']['DIRAC']
        if result['Value']['Extensions']:
          for e, v in result['Value']['Extensions'].items():
            print "%s version" % e, v
        print
    elif option == "errors":    
      self.getErrors( argss )
    else:
      print "Unknown option:", option
Пример #7
0
class MonitorAgents(AgentModule):
  """MonitorAgents class."""

  def __init__(self, *args, **kwargs):
    """Set default option values and create the clients and bookkeeping containers.

    All arguments are forwarded unchanged to ``AgentModule.__init__``. Most of the
    defaults assigned here are re-read from the agent options in ``beginExecution``.
    """
    AgentModule.__init__(self, *args, **kwargs)
    hostname = socket.gethostname()

    self.name = "MonitorAgents"
    self.setup = "Production"
    self.diracLocation = "/opt/dirac/pro"

    # switches controlling what the agent is allowed to do, all off by default
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False

    # clients used during a cycle; csAPI is instantiated in beginExecution
    self.sysAdminClient = SystemAdministratorClient(hostname)
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # instances found on this host, filled in beginExecution
    self.agents = {}
    self.executors = {}
    self.services = {}

    # errors and treatments collected during a cycle, reported by sendNotification
    self.errors = []
    self.accounting = defaultdict(dict)

    # email notification settings
    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % hostname

  def logError(self, errStr, varMsg=''):
    """Append errors to a list, which is sent in email notification."""
    self.log.error(errStr, varMsg)
    self.errors.append(errStr + " " + varMsg)

  def beginExecution(self):
    """Reload the options and the list of running instances before every cycle.

    :returns: S_OK, or S_ERROR when the running agents, executors or services cannot be obtained
    """
    # (attribute name, option name): the current attribute value is used as the default
    optionMap = (('setup', "Setup"),
                 ('enabled', "EnableFlag"),
                 ('restartAgents', "RestartAgents"),
                 ('restartExecutors', "RestartExecutors"),
                 ('restartServices', "RestartServices"),
                 ('addressTo', 'MailTo'),
                 ('addressFrom', 'MailFrom'),
                 ('controlComponents', 'ControlComponents'),
                 ('commitURLs', 'CommitURLs'))
    for attribute, optionName in optionMap:
      setattr(self, attribute, self.am_getOption(optionName, getattr(self, attribute)))

    # the installation location comes from the environment, not from the agent options
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)

    self.csAPI = CSAPI()

    # fills self.agents, self.executors and self.services
    for instanceType in ('Agents', 'Executors', 'Services'):
      res = self.getRunningInstances(instanceType=instanceType)
      if not res["OK"]:
        return S_ERROR("Failure to get running %s" % instanceType.lower())
      setattr(self, instanceType.lower(), res["Value"])

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle.

    Nothing is sent when neither errors nor accounting entries were collected.
    The body contains a table with one row per entry of ``self.accounting``, followed
    by the collected errors, one per line. ``self.errors`` and ``self.accounting``
    are emptied afterwards, whether or not the mails could be sent.

    :returns: S_OK
    """
    if not (self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    rows = [[[instanceName],
             [val.get('Treatment', 'No Treatment')],
             [str(val.get('LogAge', 'Not Relevant'))]]
            for instanceName, val in self.accounting.iteritems()]

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      # the header gets its own line, otherwise the first error is glued to "Errors:"
      emailBody += "\n\nErrors:\n" + "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        # a failure for one recipient must not prevent mailing the others
        self.log.error("Failure to send Email notification to ", address)

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of the agents, executors or services set up and installed on this host.

    Key is the instance name, value is a dict with PollingTime, Port, LogFileLocation, PID, Module,
    RunitStatus and System.

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: S_OK with the dictionary of instances, or the error of the system administrator client
    """
    statusRes = self.sysAdminClient.getOverallStatus()
    if not statusRes["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, statusRes["Message"])
      return statusRes

    instances = defaultdict(dict)
    for system, components in statusRes['Value'][instanceType].iteritems():
      for name, info in components.iteritems():
        if not (info['Setup'] and info['Installed']):
          continue
        if runitStatus != 'All' and info['RunitStatus'] != runitStatus:
          continue

        # options configured for this instance in the CS
        confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + name)
        for option, default in (('PollingTime', HOUR), ('Port', None)):
          instances[name][option] = gConfig.getValue(os.path.join(confPath, option), default)

        entry = instances[name]
        entry["LogFileLocation"] = os.path.join(self.diracLocation, 'runit', system, name, 'log', 'current')
        # values reported by the system administrator service
        entry["PID"] = info["PID"]
        entry['Module'] = info['Module']
        entry['RunitStatus'] = info['RunitStatus']
        entry['System'] = system

    return S_OK(instances)

  def on_terminate(self, agentName, process):
    """Execute callback when a process terminates gracefully."""
    self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid))

  def execute(self):
    """Execute checks for agents, executors, services.

    After the individual checks the component status is compared with the configuration and,
    if no error was collected so far, the service URLs are checked. A notification
    is sent at the end of the cycle.

    :returns: S_ERROR if any error was collected during this cycle, S_OK otherwise
    """
    for instanceType in ('executor', 'agent', 'service'):
      for name, options in getattr(self, instanceType + 's').iteritems():
        # call checkAgent, checkExecutor, checkService
        res = getattr(self, 'check' + instanceType.capitalize())(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      # errors containing these texts are deliberately not reported
      if "Stopped does not exist" not in res['Message'] and \
         "Running does not exist" not in res['Message']:
        self.logError("Failure to control components", res['Message'])

    if not self.errors:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])
    else:
      self.logError('Something was wrong before, not checking URLs this time')

    # sendNotification empties self.errors, so the outcome of the cycle has to be
    # captured before calling it, otherwise the cycle could never be reported as failed
    cycleFailed = bool(self.errors)
    self.sendNotification()

    if cycleFailed:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the age of the log file, based on its modification time.

    :param str logFileLocation: path to the log file
    :returns: S_OK with a timedelta (now minus modification time), S_ERROR if the file cannot be accessed
    """
    try:
      modified = datetime.fromtimestamp(os.path.getmtime(logFileLocation))
    except OSError as err:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, err))

    return S_OK(datetime.now() - modified)

  def restartInstance(self, pid, instanceName, enabled):
    """Terminate the process of an instance and all of its children.

    NOTE(review): the restart itself is not done here, presumably the supervisor (runit)
    brings the instance back up -- confirm.

    :param int pid: process ID of the instance
    :param str instanceName: name of the instance, used for logging and accounting
    :param bool enabled: restart switch for this type of instance, combined with ``self.enabled``
    :returns: S_OK(NO_RESTART) when restarting is disabled, S_OK() after terminating the processes,
       S_ERROR() if psutil raised an error
    """
    if not self.enabled or not enabled:
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      mainProcess = psutil.Process(int(pid))
      targets = mainProcess.children(recursive=True)
      targets.append(mainProcess)

      # ask politely first
      for target in targets:
        target.terminate()

      # wait up to 5 seconds, whatever is still alive after that gets killed
      onTerminate = partial(self.on_terminate, instanceName)
      _gone, survivors = psutil.wait_procs(targets, timeout=5, callback=onTerminate)
      for survivor in survivors:
        self.log.info("Forcefully killing process %s" % survivor.pid)
        survivor.kill()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

    return S_OK()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond.

    :param str serviceName: name of the service
    :param dict options: options of the service; 'PID' is read here, 'System' and 'Port' by _getURL
    :returns: S_OK, or the S_ERROR returned by the failed restart attempt
    """
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if pingRes['OK']:
      # only report a successful ping when the ping actually succeeded
      self.log.info("Service responded OK")
      return S_OK()

    self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
    res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
      self.log.info("Service %s has been successfully restarted" % serviceName)
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent.

    The log file is too old when its age reaches the larger of PollingTime plus one HOUR and two HOUR.

    :param str agentName: name of the agent
    :param dict options: options of the agent, 'PollingTime', 'LogFileLocation' and 'PID' are read
    :returns: S_OK, or S_ERROR if the log file cannot be accessed or the restart failed
    """
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    # timedelta.seconds only holds the remainder below one day (and wraps around for
    # negative ages), total_seconds() is the complete age of the log file
    ageSeconds = res["Value"].total_seconds()
    ageMinutes = int(ageSeconds // MINUTES)
    self.log.info("Current log file for %s is %d minutes old" % (agentName, ageMinutes))

    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if ageSeconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = ageMinutes

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.

    The executor is left alone when its log file is younger than two HOUR, or when no job is
    waiting for it in 'Checking' status.

    :param str executor: name of the executor
    :param dict options: options of the executor, 'LogFileLocation' and 'PID' are read
    :returns: S_OK, S_OK(NO_RESTART) when there are no jobs to check, or S_ERROR from a failed step
    """
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    # timedelta.seconds only holds the remainder below one day (and wraps around for
    # negative ages), total_seconds() is the complete age of the log file
    ageSeconds = res["Value"].total_seconds()
    ageMinutes = int(ageSeconds // MINUTES)
    self.log.info("Current log file for %s is %d minutes old" % (executor, ageMinutes))

    if ageSeconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = ageMinutes

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['Value'] == NO_CHECKING_JOBS:
      # an idle executor has nothing to log, so an old log file is not reported
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Look for jobs in 'Checking' status whose MinorStatus is **executorName**.

    :param str executorName: name of the executor
    :returns: S_OK(CHECKING_JOBS) if such jobs exist, S_OK(NO_CHECKING_JOBS) if not,
       or the error of the job monitoring client
    """
    # getJobs returns the list of matching job IDs
    resJobs = self.jobMonClient.getJobs({'Status': 'Checking', 'MinorStatus': executorName})
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs

    jobIDs = resJobs['Value']
    if not jobIDs:
      self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
      return S_OK(NO_CHECKING_JOBS)

    self.log.info("Found %d jobs in 'Checking' status for %s" % (len(jobIDs), executorName))
    return S_OK(CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Compare the running and stopped components with the status configured in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections, start or stop those in the wrong
    state and report the instances present on only one of the two sides.

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault

    current, configured = resCurrent['Value'], resDefault['Value']

    # configured to run but down, configured to be down but running
    toStart = configured['Run'].intersection(current['Down'])
    toStop = configured['Down'].intersection(current['Run'])
    # known to only one of configuration and host
    unknown = configured['All'].symmetric_difference(current['All'])

    self._ensureComponentRunning(toStart)
    self._ensureComponentDown(toStop)

    for instance in unknown:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Collect the components with RunitStatus 'Run' or 'Down' from the system administrator.

    :returns: S_OK with a dict of sets of '<system>__<instance>' identifiers under the keys
       'Down', 'Run' and 'All', or the error of the system administrator client
    """
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall

    byStatus = {'Down': set(), 'Run': set(), 'All': set()}
    for systems in resOverall['Value'].values():
      for system, instances in systems.items():
        for instanceName, info in instances.items():
          status = info.get('RunitStatus')
          # any other status is ignored
          if status in ('Run', 'Down'):
            byStatus[status].add('%s__%s' % (system, instanceName))

    byStatus['All'] = byStatus['Run'] | byStatus['Down']
    return S_OK(byStatus)

  def _getDefaultComponentStatus(self):
    """Read the configured status of the components of this host from the CS.

    The option names of /Registry/Hosts/<hostname>/Running and .../Stopped are used as identifiers.

    :returns: S_OK with a dict of sets under the keys 'Down', 'Run' and 'All', S_ERROR if a section
       cannot be read or an identifier is listed in both sections
    """
    hostSection = os.path.join('/Registry/Hosts/', socket.gethostname())
    resRunning = gConfig.getOptionsDict(os.path.join(hostSection, 'Running'))
    resStopped = gConfig.getOptionsDict(os.path.join(hostSection, 'Stopped'))
    for res in (resRunning, resStopped):
      if not res['OK']:
        return res

    running = set(resRunning['Value'])
    stopped = set(resStopped['Value'])

    # an instance cannot be configured as running and stopped at the same time
    overlap = running.intersection(stopped)
    if overlap:
      self.logError("Overlap in configuration", str(overlap))
      return S_ERROR("Bad host configuration")

    return S_OK({'Down': stopped, 'Run': running, 'All': running | stopped})

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running."""
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running."""
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Check the configured URLs of all services on this host and commit changes if enabled.

    :returns: S_OK, or S_ERROR if the services cannot be obtained or the commit to the CS fails
    """
    self.log.info("Checking URLs")
    # refresh the configuration and read the services again (any RunitStatus this time),
    # in case they were started or stopped in componentControl
    gConfig.forceRefresh(fromMaster=True)
    res = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]

    for service, options in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (service, options))
      # the SystemAdministrator does not have URLs, everything else is checked
      if 'SystemAdministrator' not in service:
        self._checkServiceURL(service, options)

    if not (self.csAPI.csModified and self.commitURLs):
      return S_OK()

    self.log.info("Commiting changes to the CS")
    result = self.csAPI.commit()
    if result['OK']:
      return S_OK()

    self.logError('Commit to CS failed', result['Message'])
    return S_ERROR("Failed to commit to CS")

  def _checkServiceURL(self, serviceName, options):
    """Add the URL of a running service to the CS, remove the URL of a stopped one.

    The change is only staged in ``self.csAPI``; committing is left to the caller.

    :param str serviceName: name of the service
    :param dict options: options of the service, 'System', 'Module' and 'RunitStatus' are read here
    """
    url = self._getURL(serviceName, options)
    system, module = options['System'], options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))

    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)

    runitStatus = options['RunitStatus']
    # prefix for the report when changes are not going to be committed
    wouldHave = '' if self.commitURLs else 'Would have '

    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      template = "%sAdded URL %s to URLs for %s/%s"
    elif runitStatus == 'Down' and url in urls:
      urls.remove(url)
      template = "%sRemoved URL %s from URLs for %s/%s"
    else:
      # configuration already matches the state of the service
      return

    message = template % (wouldHave, url, system, module)
    self.log.info(message)
    self.accounting[serviceName + "/URL"]["Treatment"] = message
    self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service."""
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
Пример #8
0
class ComponentSupervisionAgent(AgentModule):
    """ComponentSupervisionAgent class."""
    def __init__(self, *args, **kwargs):
        """Set default option values and create the clients and bookkeeping containers.

        All arguments are forwarded unchanged to ``AgentModule.__init__``. Most of the
        defaults assigned here are re-read from the agent options in ``beginExecution``.
        """
        AgentModule.__init__(self, *args, **kwargs)
        fqdn = socket.getfqdn()

        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        self.diracLocation = rootPath

        # switches controlling what the agent is allowed to do, all off by default
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        # instances whose name contains one of these are never restarted, see restartInstance
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]

        # clients used during a cycle; csAPI is instantiated in beginExecution
        self.sysAdminClient = SystemAdministratorClient(fqdn)
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None

        # instances found on this host, filled in beginExecution
        self.agents = {}
        self.executors = {}
        self.services = {}
        self._tornadoPort = "8443"

        # errors and treatments collected during a cycle, reported by sendNotification
        self.errors = []
        self.accounting = defaultdict(dict)

        # email notification settings
        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % fqdn

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification."""
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the options and the list of running instances before every cycle.

        :returns: S_OK, or S_ERROR when the running agents, executors or services cannot be obtained
        """
        # (attribute name, option name): the current attribute value is used as the default
        optionMap = (
            ("setup", "Setup"),
            ("enabled", "EnableFlag"),
            ("restartAgents", "RestartAgents"),
            ("restartExecutors", "RestartExecutors"),
            ("restartServices", "RestartServices"),
            ("addressTo", "MailTo"),
            ("addressFrom", "MailFrom"),
            ("controlComponents", "ControlComponents"),
            ("commitURLs", "CommitURLs"),
            ("doNotRestartInstancePattern", "DoNotRestartInstancePattern"),
        )
        for attribute, optionName in optionMap:
            current = getattr(self, attribute)
            setattr(self, attribute, self.am_getOption(optionName, current))

        self.csAPI = CSAPI()

        # fills self.agents, self.executors and self.services
        for instanceType in ("Agents", "Executors", "Services"):
            res = self.getRunningInstances(instanceType=instanceType)
            if not res["OK"]:
                return S_ERROR("Failure to get running %s" %
                               instanceType.lower())
            setattr(self, instanceType.lower(), res["Value"])

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle.

        Nothing is sent when neither errors nor accounting entries were collected.
        The body contains a table with one row per entry of ``self.accounting``, followed
        by the collected errors, one per line. ``self.errors`` and ``self.accounting``
        are emptied afterwards, whether or not the mails could be sent.

        :returns: S_OK
        """
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = [[[instanceName],
                 [val.get("Treatment", "No Treatment")],
                 [str(val.get("LogAge", "Not Relevant"))]]
                for instanceName, val in self.accounting.items()]

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns,
                                    rows,
                                    printOut=False,
                                    numbering=False,
                                    columnSeparator=" | ")

        if self.errors:
            # the header gets its own line, otherwise the first error is glued to "Errors:"
            emailBody += "\n\nErrors:\n" + "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address,
                                        self.emailSubject,
                                        emailBody,
                                        self.addressFrom,
                                        localAttempt=False)
            if not res["OK"]:
                # a failure for one recipient must not prevent mailing the others
                self.log.error("Failure to send Email notification to ",
                               address)

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of the agents, executors or services set up and installed on this host.

        Key is component's name, value is a dict with LogFileLocation, PID, Module, RunitStatus and
        System, plus PollingTime, Port and Protocol when they have a non-empty value.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: S_OK with the dictionary of instances, or the error of the system administrator client
        """
        statusRes = self.sysAdminClient.getOverallStatus()
        if not statusRes["OK"]:
            self.logError(
                "Failure to get %s from system administrator client" %
                instanceType, statusRes["Message"])
            return statusRes

        optionDefaults = (("PollingTime", HOUR), ("Port", None), ("Protocol", None))
        instances = defaultdict(dict)
        for system, components in statusRes["Value"][instanceType].items():
            for name, info in components.items():
                if not (info["Setup"] and info["Installed"]):
                    continue
                if runitStatus != "All" and info["RunitStatus"] != runitStatus:
                    continue

                for option, default in optionDefaults:
                    value = self._getComponentOption(instanceType, system,
                                                     name, option, default)
                    entry = instances[name]
                    # empty values are left out so that defaults can be used in _getURL
                    if value:
                        entry[option] = value
                    else:
                        entry.pop(option, None)

                entry = instances[name]
                entry["LogFileLocation"] = os.path.join(
                    self.diracLocation, "runit", system, name, "log",
                    "current")
                # values reported by the system administrator service
                entry["PID"] = info["PID"]
                entry["Module"] = info["Module"]
                entry["RunitStatus"] = info["RunitStatus"]
                entry["System"] = system

        return S_OK(instances)

    def _getComponentOption(self, instanceType, system, componentName, option,
                            default):
        """Get a component option from the DIRAC CS.

        Options of agents are resolved through an ``AgentModule`` instance and its ``am_getOption``,
        options of other components are read directly from the component section.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str system: name of the system the component belongs to
        :param str componentName: name of the component
        :param str option: name of the option
        :param default: value returned when the option is not set
        :returns: value of the option
        """
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )

        if instanceType == "Agents":
            # the agent may be loaded from a module with a different name
            loadModule = gConfig.getValue(
                Path.cfgPath(componentPath, "Module"), componentName)
            agent = AgentModule(Path.cfgPath(system, componentName),
                                Path.cfgPath(system, loadModule))
            return agent.am_getOption(option, default)

        return gConfig.getValue(Path.cfgPath(componentPath, option), default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully."""
        self.log.info(
            "%s's process with ID: %s has been terminated successfully" %
            (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services.

        After the individual checks the component status is compared with the configuration and,
        if no error was collected so far, the service URLs are checked. A notification
        is sent at the end of the cycle.

        :returns: S_ERROR if any error was collected during this cycle, S_OK otherwise
        """
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                check = getattr(self, "check" + instanceType.capitalize())
                res = check(name, options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType,
                                  "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            # errors containing these texts are deliberately not reported
            if ("Stopped does not exist" not in res["Message"]
                    and "Running does not exist" not in res["Message"]):
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError(
                "Something was wrong before, not checking URLs this time")

        # sendNotification empties self.errors, so the outcome of the cycle has to be
        # captured before calling it, otherwise the cycle could never be reported as failed
        cycleFailed = bool(self.errors)
        self.sendNotification()

        if cycleFailed:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of the log file, based on its modification time.

        :param str logFileLocation: path to the log file
        :returns: S_OK with a timedelta (now minus modification time), S_ERROR if the file cannot be accessed
        """
        try:
            modified = datetime.fromtimestamp(
                os.path.getmtime(logFileLocation))
        except OSError as err:
            return S_ERROR("Failed to access logfile %s: %r" %
                           (logFileLocation, err))

        return S_OK(datetime.now() - modified)

    def restartInstance(self, pid, instanceName, enabled):
        """Terminate the process of an instance and all of its children.

        NOTE(review): the restart itself is not done here, presumably the supervisor (runit)
        brings the instance back up -- confirm.

        :param int pid: process ID of the instance
        :param str instanceName: name of the instance, used for logging and accounting
        :param bool enabled: restart switch for this type of instance, combined with ``self.enabled``
        :returns: S_OK(NO_RESTART) when restarting is disabled or the instance name matches
           ``doNotRestartInstancePattern``, S_OK() after terminating the processes,
           S_ERROR() if psutil raised an error
        """
        # work out whether, and why, the instance must not be restarted by this agent
        refusal = None
        if not self.enabled or not enabled:
            refusal = "Restarting is disabled, please restart %s manually" % instanceName
        elif any(pattern in instanceName
                 for pattern in self.doNotRestartInstancePattern):
            refusal = "Restarting for %s is disabled, please restart it manually" % instanceName

        if refusal is not None:
            self.log.info(refusal)
            self.accounting[instanceName][
                "Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            mainProcess = psutil.Process(int(pid))
            targets = mainProcess.children(recursive=True)
            targets.append(mainProcess)

            # ask politely first
            for target in targets:
                target.terminate()

            # wait up to 5 seconds, whatever is still alive after that gets killed
            onTerminate = partial(self.on_terminate, instanceName)
            _gone, survivors = psutil.wait_procs(targets,
                                                 timeout=5,
                                                 callback=onTerminate)
            for survivor in survivors:
                self.log.info("Forcefully killing process %s" % survivor.pid)
                survivor.kill()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes",
                          "%s" % err)
            return S_ERROR()

        return S_OK()

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond.

        :param str serviceName: name of the service
        :param dict options: options of the service, passed to _getURL; 'PID' is read here
        :returns: S_OK, or the S_ERROR returned by the failed restart attempt
        """
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if pingRes["OK"]:
            # only report a successful ping when the ping actually succeeded
            self.log.info("Service responded OK")
            return S_OK()

        self.log.info("Failure pinging service: %s: %s" %
                      (url, pingRes["Message"]))
        res = self.restartInstance(int(options["PID"]), serviceName,
                                   self.restartServices)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[serviceName][
                "Treatment"] = "Successfully Restarted"
            self.log.info("Service %s has been successfully restarted" %
                          serviceName)
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of the agent's log file and restart the agent if the log is too old.

        The log counts as too old once its age reaches
        ``max(PollingTime + HOUR, 2 * HOUR)``.

        :param str agentName: name of the agent instance, also used as accounting key
        :param dict options: instance options; ``PollingTime``, ``LogFileLocation``
            and ``PID`` are read
        :returns: S_OK if the log is recent enough or the restart attempt was handled,
            otherwise the error structure returned by ``getLastAccessTime`` or
            ``restartInstance``
        """
        pollingTime, currentLogLocation, pid = (options["PollingTime"],
                                                options["LogFileLocation"],
                                                options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # NOTE(review): ``age`` is presumably a datetime.timedelta (the code used to
        # read ``age.seconds``) -- confirm against getLastAccessTime.
        # ``timedelta.seconds`` holds only the sub-day remainder, so a log older
        # than 24 hours looked recent; total_seconds() covers the whole duration.
        ageSeconds = int(res["Value"].total_seconds())
        self.log.info("Current log file for %s is %d minutes old" %
                      (agentName, (ageSeconds / MINUTES)))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if ageSeconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = ageSeconds / MINUTES

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        # NO_RESTART means restarting is disabled, so nothing was restarted
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" %
                          agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of the executor's log file and restart the executor if needed.

        A log younger than two hours means the executor is fine. An older log only
        triggers a restart when ``checkForCheckingJobs`` reports jobs for this
        executor; otherwise the executor is removed from the accounting.

        :param str executor: name of the executor instance, also used as accounting key
        :param dict options: instance options; ``LogFileLocation`` and ``PID`` are read
        :returns: S_OK (carrying NO_RESTART when there are no checking jobs), or the
            error structure returned by one of the called helpers
        """
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # NOTE(review): ``age`` is presumably a datetime.timedelta (the code used to
        # read ``age.seconds``) -- confirm against getLastAccessTime.
        # ``timedelta.seconds`` holds only the sub-day remainder, so a log older
        # than 24 hours looked recent; total_seconds() covers the whole duration.
        ageSeconds = int(res["Value"].total_seconds())
        self.log.info("Current log file for %s is %d minutes old" %
                      (executor, (ageSeconds / MINUTES)))

        if ageSeconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = ageSeconds / MINUTES

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["Value"] == NO_CHECKING_JOBS:
            # nothing is waiting for this executor: drop the accounting entry
            # created above and leave the executor alone
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        # NO_RESTART means restarting is disabled, so nothing was restarted
        if res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" %
                          executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Report whether jobs in "Checking" status have **executorName** as MinorStatus.

        :param str executorName: executor name matched against the jobs' MinorStatus
        :returns: S_OK(CHECKING_JOBS) if at least one job matches,
            S_OK(NO_CHECKING_JOBS) if none does, or the failed result of the job query
        """
        # the query result carries the list of matching job IDs
        jobsResult = self.jobMonClient.getJobs({
            "Status": "Checking",
            "MinorStatus": executorName
        })
        if not jobsResult["OK"]:
            self.logError("Could not get jobs for this executor",
                          "%s: %s" % (executorName, jobsResult["Message"]))
            return jobsResult

        jobIDs = jobsResult["Value"]
        if not jobIDs:
            self.log.info('Found no jobs in "Checking" status for %s' %
                          executorName)
            return S_OK(NO_CHECKING_JOBS)

        self.log.info('Found %d jobs in "Checking" status for %s' %
                      (len(jobIDs), executorName))
        return S_OK(CHECKING_JOBS)

    def componentControl(self):
        """Bring the installed components in line with the status configured in the CS.

        The current status of the components is compared with the one configured in
        the Registry/Hosts/_HOST_/[Running|Stopped] sections: components that are
        down but configured to run are handed to ``_ensureComponentRunning``, running
        components configured as stopped are handed to ``_ensureComponentDown``, and
        instances known to only one of the two sides are reported as errors.

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # what is actually installed and in which state
        actualRes = self._getCurrentComponentStatus()
        if not actualRes["OK"]:
            return actualRes
        actual = actualRes["Value"]

        # what the configuration asks for
        configuredRes = self._getDefaultComponentStatus()
        if not configuredRes["OK"]:
            return configuredRes
        configured = configuredRes["Value"]

        toStart = configured["Run"].intersection(actual["Down"])
        toStop = configured["Down"].intersection(actual["Run"])
        # present on exactly one side: installed but not configured, or vice versa
        unknown = configured["All"].symmetric_difference(actual["All"])

        self._ensureComponentRunning(toStart)
        self._ensureComponentDown(toStop)

        for instance in unknown:
            self.logError("Unknown instance",
                          "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Collect the runit status of the components known to the SystemAdministrator.

        :returns: S_OK with a dict of sets under the keys ``Down``, ``Run`` and ``All``,
            each entry being ``<system>__<instance>``; or the failed result of
            ``getOverallStatus``
        """
        overallRes = self.sysAdminClient.getOverallStatus()
        if not overallRes["OK"]:
            return overallRes

        downInstances = set()
        runInstances = set()
        for systems in overallRes["Value"].values():
            for system, instances in systems.items():
                for name, info in instances.items():
                    status = info.get("RunitStatus")
                    # instances with any other runit status are left out entirely
                    if status == "Run":
                        runInstances.add("%s__%s" % (system, name))
                    elif status == "Down":
                        downInstances.add("%s__%s" % (system, name))

        return S_OK({
            "Down": downInstances,
            "Run": runInstances,
            "All": runInstances | downInstances,
        })

    def _getDefaultComponentStatus(self):
        """Read the status configured for the components of this host.

        The option names of the ``Running`` and ``Stopped`` sections under
        ``/Registry/Hosts/<fqdn>`` are used as the sets of instances.

        :returns: S_OK with a dict of sets under the keys ``Down``, ``Run`` and ``All``;
            the failed result if a section cannot be read; S_ERROR if an instance
            is listed in both sections
        """
        hostName = socket.getfqdn()
        # both sections are queried before either result is inspected
        runningRes = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", hostName, "Running"))
        stoppedRes = gConfig.getOptionsDict(
            Path.cfgPath("/Registry/Hosts/", hostName, "Stopped"))
        for sectionRes in (runningRes, stoppedRes):
            if not sectionRes["OK"]:
                return sectionRes

        shouldRun = set(runningRes["Value"])
        shouldBeDown = set(stoppedRes["Value"])

        overlap = shouldRun.intersection(shouldBeDown)
        if overlap:
            self.logError("Overlap in configuration", str(overlap))
            return S_ERROR("Bad host configuration")

        return S_OK({
            "Down": shouldBeDown,
            "Run": shouldRun,
            "All": shouldRun | shouldBeDown,
        })

    def _ensureComponentRunning(self, shouldBeRunning):
        """Start the given instances, or only record that they should be started.

        :param shouldBeRunning: iterable of ``<system>__<name>`` instance identifiers
        """
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            system, name = instance.split("__")

            if not self.controlComponents:
                # control is switched off: only report what would be done
                self.accounting[instance][
                    "Treatment"] = "Instance is down, should be started"
                continue

            startRes = self.sysAdminClient.startComponent(system, name)
            if startRes["OK"]:
                self.accounting[instance][
                    "Treatment"] = "Instance was down, started instance"
            else:
                self.logError("Failed to start component:",
                              "%s: %s" % (instance, startRes["Message"]))

    def _ensureComponentDown(self, shouldBeDown):
        """Stop the given instances, or only record that they should be stopped.

        :param shouldBeDown: iterable of ``<system>__<name>`` instance identifiers
        """
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            system, name = instance.split("__")

            if not self.controlComponents:
                # control is switched off: only report what would be done
                self.accounting[instance][
                    "Treatment"] = "Instance is running, should be stopped"
                continue

            stopRes = self.sysAdminClient.stopComponent(system, name)
            if stopRes["OK"]:
                self.accounting[instance][
                    "Treatment"] = "Instance was running, stopped instance"
            else:
                self.logError("Failed to stop component:",
                              "%s: %s" % (instance, stopRes["Message"]))

    def checkURLs(self):
        """Make sure the URLs of the services in the Config match their runit status.

        Every service except the SystemAdministrator is passed to
        ``_checkServiceURL``; resulting CS modifications are committed only when
        ``commitURLs`` is set.

        :returns: S_OK, or S_ERROR if the services cannot be listed or the commit fails
        """
        self.log.info("Checking URLs")
        # refresh first: controlComponents may just have started/stopped services
        gConfig.forceRefresh(fromMaster=True)

        # look up the port used by the https based services
        try:
            tornadoInstance = PathFinder.getSystemInstance(system="Tornado",
                                                           setup=self.setup)
            portPath = Path.cfgPath("/System/Tornado/", tornadoInstance,
                                    "Port")
            self._tornadoPort = gConfig.getValue(portPath, self._tornadoPort)
        except RuntimeError:
            # keep the port that is already set
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        servicesRes = self.getRunningInstances(instanceType="Services",
                                               runitStatus="All")
        if not servicesRes["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = servicesRes["Value"]

        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" %
                           (service, options))
            # the SystemAdministrator does not have URLs, skip it
            if "SystemAdministrator" not in service:
                self._checkServiceURL(service, options)

        if not (self.csAPI.csModified and self.commitURLs):
            return S_OK()

        self.log.info("Commiting changes to the CS")
        commitRes = self.csAPI.commit()
        if commitRes["OK"]:
            return S_OK()
        self.logError("Commit to CS failed", commitRes["Message"])
        return S_ERROR("Failed to commit to CS")

    def _checkServiceURL(self, serviceName, options):
        """Add or remove the service URL in the CS according to the runit status.

        The URL of a running service is added if it is missing; the URL of a
        service that is down is removed if it is present.

        :param str serviceName: name of the service instance
        :param dict options: instance options; ``System``, ``Module`` and
            ``RunitStatus`` are read here, others by ``_getURL``
        """
        url = self._getURL(serviceName, options)
        system, module = options["System"], options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))

        urlSection = PathFinder.getSystemURLSection(system=system,
                                                    setup=self.setup)
        urlsConfigPath = Path.cfgPath(urlSection, module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)

        runitStatus = options["RunitStatus"]
        # without commitURLs the change is only reported, hence the prefix
        wouldHave = "" if self.commitURLs else "Would have "

        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url,
                                                            system, module)
        elif runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url,
                                                                system, module)
        else:
            # the configuration already matches the runit status
            return

        self.log.info(message)
        self.accounting[serviceName + "/URL"]["Treatment"] = message
        self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Build the URL of the service on this host.

        :param str serviceName: last path element of the URL
        :param dict options: ``System`` is required; ``Port`` defaults to the
            Tornado port and ``Protocol`` to ``dips``
        :returns: URL string ``<protocol>://<fqdn>:<port>/<system>/<serviceName>``
        """
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        location = (options.get("Protocol", "dips"), socket.getfqdn(), port,
                    system, serviceName)
        return "%s://%s:%s/%s/%s" % location
Пример #9
0
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show hosts         - show all available hosts
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is printed verbatim as the help text when no
    # option is given, so it is user-visible output.

    argss = args.split()
    if not argss:
      gLogger.notice( self.do_show.__doc__ )
      return

    # first word selects the sub-command, the rest are its arguments
    option = argss[0]
    del argss[0]

    if option == 'software':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSoftwareComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'installed':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInstalledComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'setup':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSetupComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( "Current project is %s" % result[ 'Value' ] )
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        fields = ["System",'Name','Module','Type','Setup','Installed','Runit','Uptime','PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            # sorted() accepts both a key list and a key view, whereas
            # keys() followed by .sort() fails on Python 3
            for component in sorted( rDict[compType][system] ):
              info = rDict[compType][system][component]
              # only installed components are listed
              if not info['Installed']:
                continue
              # component type is the plural section name, lower-cased,
              # without its last character
              record = [ system, component, str( info['Module'] ), compType.lower()[:-1] ]
              record.append( 'Setup' if info['Setup'] else 'NotSetup' )
              record.append( 'Installed' )
              record.append( str( info['RunitStatus'] ) )
              record.append( str( info['Timeup'] ) )
              record.append( str( info['PID'] ) )
              records.append( record )
        printTable( fields, records )
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self.__errMsg( resultSW['Message'] )
        return

      sw = resultSW['Value']
      installed = result['Value']
      gLogger.notice( '' )
      for db in sw:
        if db in installed:
          gLogger.notice( db.rjust( 25 ), ': Installed' )
        else:
          gLogger.notice( db.rjust( 25 ), ': Not installed' )
      if not sw:
        gLogger.notice( "No database found" )
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      elif result['Value']:
        gLogger.notice( '' )
        for par, value in result['Value'].items():
          # format the line first: passing a tuple would print its repr
          gLogger.notice( '%s : %s' % ( par.rjust( 28 ), value ) )
      else:
        gLogger.notice( "No MySQL database found" )
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Setup:", result['Value']['Setup'] )
        gLogger.notice( "DIRAC version:", result['Value']['DIRAC'] )
        if result['Value']['Extensions']:
          for e, v in result['Value']['Extensions'].items():
            gLogger.notice( "%s version" % e, v )
        gLogger.notice( '' )
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Host info:" )
        gLogger.notice( '' )

        fields = ['Parameter','Value']
        records = [ [ key, str( value ) ] for key, value in result['Value'].items() ]

        printTable( fields, records )
    elif option == "hosts":
      client = ComponentMonitoringClient()
      result = client.getHosts( {}, False, False )
      if not result[ 'OK' ]:
        self.__errMsg( 'Error retrieving the list of hosts: %s' % ( result[ 'Message' ] ) )
      else:
        hostList = result[ 'Value' ]
        gLogger.notice( '' )
        gLogger.notice( ' ' + 'Host'.center( 32 ) + ' ' + 'CPU'.center( 34 ) + ' ' )
        gLogger.notice( ( '-' * 69 ) )
        for element in hostList:
          gLogger.notice( '|' + element[ 'HostName' ].center( 32 ) + '|' + element[ 'CPU' ].center( 34 ) + '|' )
        gLogger.notice( ( '-' * 69 ) )
        gLogger.notice( '' )
    elif option == "errors":
      self.getErrors( argss )
    elif option == "installations":
      self.getInstallations( argss )
    else:
      gLogger.notice( "Unknown option:", option )
Пример #10
0
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show doc <type> <system> <name>
                             - show documentation for a given service or agent
          show host          - show host related parameters
          show hosts         - show all available hosts
          show ports [host]  - show all ports used by a host. If no host is given, the host currently connected to is used
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show profile <system> <component> [ -s <size> | -h <host> | -id <initial date DD/MM/YYYY> | -it <initial time hh:mm>
                              | -ed <end date DD/MM/YYYY | -et <end time hh:mm> ]*
                             - show <size> log lines of profiling information for a component in the machine <host>
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is printed verbatim as the help text (no
    # option given, or malformed "doc"/"profile" arguments), so it is
    # user-visible output.

    argss = args.split()
    if not argss:
      gLogger.notice( self.do_show.__doc__ )
      return

    # first word selects the sub-command, the rest are its arguments
    option = argss[0]
    del argss[0]

    if option == 'software':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSoftwareComponents()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'installed':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInstalledComponents()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'setup':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getSetupComponents()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( "Current project is %s" % result[ 'Value' ] )
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        fields = ["System",'Name','Module','Type','Setup','Installed','Runit','Uptime','PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            # sorted() accepts both a key list and a key view, whereas
            # keys() followed by .sort() fails on Python 3
            for component in sorted( rDict[compType][system] ):
              info = rDict[compType][system][component]
              # only installed components are listed
              if not info['Installed']:
                continue
              # component type is the plural section name, lower-cased,
              # without its last character
              record = [ system, component, str( info['Module'] ), compType.lower()[:-1] ]
              record.append( 'Setup' if info['Setup'] else 'NotSetup' )
              record.append( 'Installed' )
              record.append( str( info['RunitStatus'] ) )
              record.append( str( info['Timeup'] ) )
              record.append( str( info['PID'] ) )
              records.append( record )
        printTable( fields, records )
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      if not gComponentInstaller.mysqlPassword:
        gComponentInstaller.mysqlPassword = "******"
      gComponentInstaller.getMySQLPasswords()
      result = client.getDatabases( gComponentInstaller.mysqlRootPwd )
      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self._errMsg( resultSW['Message'] )
        return

      sw = resultSW['Value']
      installed = result['Value']
      gLogger.notice( '' )
      for db in sw:
        if db in installed:
          gLogger.notice( db.rjust( 25 ), ': Installed' )
        else:
          gLogger.notice( db.rjust( 25 ), ': Not installed' )
      if not sw:
        gLogger.notice( "No database found" )
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self._errMsg( result['Message'] )
      elif result['Value']:
        gLogger.notice( '' )
        for par, value in result['Value'].items():
          # format the line first: passing a tuple would print its repr
          gLogger.notice( '%s : %s' % ( par.rjust( 28 ), value ) )
      else:
        gLogger.notice( "No MySQL database found" )
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Setup:", result['Value']['Setup'] )
        gLogger.notice( "DIRAC version:", result['Value']['DIRAC'] )
        if result['Value']['Extensions']:
          for e, v in result['Value']['Extensions'].items():
            gLogger.notice( "%s version" % e, v )
        gLogger.notice( '' )
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Host info:" )
        gLogger.notice( '' )

        fields = ['Parameter','Value']
        records = []
        # items() exists on Python 2 and 3, iteritems() only on Python 2
        for name, value in result['Value'].items():
          if name == 'Extension':
            # comma separated "<name>:<version>" pairs, one table row each
            for extension in value.split( ',' ):
              extensionName, extensionVersion = extension.split( ':' )
              records.append( [ '%sVersion' % extensionName, str( extensionVersion ) ] )
          else:
            records.append( [ name, str( value ) ] )

        printTable( fields, records )
    elif option == "hosts":
      client = ComponentMonitoringClient()
      result = client.getHosts( {}, False, False )
      if not result[ 'OK' ]:
        self._errMsg( 'Error retrieving the list of hosts: %s' % ( result[ 'Message' ] ) )
      else:
        hostList = result[ 'Value' ]
        gLogger.notice( '' )
        gLogger.notice( ' ' + 'Host'.center( 32 ) + ' ' + 'CPU'.center( 34 ) + ' ' )
        gLogger.notice( ( '-' * 69 ) )
        for element in hostList:
          gLogger.notice( '|' + element[ 'HostName' ].center( 32 ) + '|' + element[ 'CPU' ].center( 34 ) + '|' )
        gLogger.notice( ( '-' * 69 ) )
        gLogger.notice( '' )
    elif option == "ports":
      if not argss:
        client = SystemAdministratorClient( self.host )
      else:
        hostname = argss[0]
        del argss[0]

        # make sure the requested host is known before contacting it
        result = ComponentMonitoringClient().hostExists( { 'HostName': hostname } )
        if not result[ 'OK' ]:
          self._errMsg( result[ 'Message' ] )
          return
        if not result[ 'Value' ]:
          self._errMsg( 'Given host does not exist' )
          return

        client = SystemAdministratorClient( hostname )

      result = client.getUsedPorts()
      if not result[ 'OK' ]:
        self._errMsg( result[ 'Message' ] )
        return
      pprint.pprint( result[ 'Value' ] )
    elif option == "errors":
      self.getErrors( argss )
    elif option == "installations":
      self.getInstallations( argss )
    elif option == "doc":
      # expects <type> <system> <name> with type being service or agent
      if len( argss ) > 2 and argss[0] in [ 'service', 'agent' ]:
        compType = argss[0]
        compSystem = argss[1]
        compModule = argss[2]
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getComponentDocumentation( compType, compSystem, compModule )
        if result[ 'OK' ]:
          gLogger.notice( result[ 'Value' ] )
        else:
          self._errMsg( result[ 'Message' ] )
      else:
        gLogger.notice( self.do_show.__doc__ )
    elif option == "profile":
      if len( argss ) > 1:
        system = argss[0]
        del argss[0]
        component = argss[0]
        del argss[0]

        component = '%s_%s' % ( system, component )

        # remaining arguments come as "<flag> <value>" pairs; a trailing flag
        # without a value is ignored
        argDict = { '-s': None, '-h': self.host, '-id': None, '-it': '00:00', '-ed': None, '-et': '00:00' }
        key = None
        for arg in argss:
          if not key:
            key = arg
          else:
            argDict[ key ] = arg
            key = None

        size = None
        try:
          if argDict[ '-s' ]:
            size = int( argDict[ '-s' ] )
        except ValueError:
          self._errMsg( 'Argument \'size\' must be an integer' )
          return
        host = argDict[ '-h' ]
        initialDate = argDict[ '-id' ]
        initialTime = argDict[ '-it' ]
        endingDate = argDict[ '-ed' ]
        endingTime = argDict[ '-et' ]

        if initialDate:
          initialDate = '%s %s' % ( initialDate, initialTime )
        else:
          initialDate = ''
        if endingDate:
          endingDate = '%s %s' % ( endingDate, endingTime )
        else:
          endingDate = ''

        client = MonitoringClient()
        if size:
          result = client.getLimitedData( host, component, size )
        else:
          result = client.getDataForAGivenPeriod( host, component, initialDate, endingDate )

        if result[ 'OK' ]:
          records = result[ 'Value' ]
          if not records:
            # an empty result used to raise IndexError on records[0]
            gLogger.notice( 'No profiling data found' )
            return
          # one header line, then one line per record with the values in the
          # same column order as the header
          headers = list( records[0] )
          gLogger.notice( ''.join( [ str( header ).ljust( 15 ) for header in headers ] ) )
          for record in records:
            gLogger.notice( ''.join( [ str( record.get( header, '' ) ).ljust( 15 ) for header in headers ] ) )
        else:
          self._errMsg( result[ 'Message' ] )
      else:
        gLogger.notice( self.do_show.__doc__ )
    else:
      gLogger.notice( "Unknown option:", option )
Пример #11
0
    def do_show(self, args):
        """ 
        Show list of components with various related information
        
        usage:
    
          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show errors [*|<system> <service|agent>] 
                             - show error count for the given component or all the components
                               in the last hour and day
    """
        # The docstring above is printed verbatim as the help text when no
        # option is given, so it is user-facing output and is left unchanged.
        #
        # args: the raw argument string; its first whitespace-separated word
        #       selects the sub-command, the remaining words are handed to
        #       getLog / getErrors for the "log" and "errors" options.
        # Returns None; every result is written to standard output.
        #
        # All output goes through single-argument print(...) calls so that the
        # method produces the same text under Python 2 (where the parentheses
        # are a plain grouping) and Python 3 (where print is a function).

        argss = args.split()
        if not argss:
            print(self.do_show.__doc__)
            return

        option, argss = argss[0], argss[1:]

        # Options that simply pretty-print the value returned by one client call.
        listings = {
            'software': 'getSoftwareComponents',
            'installed': 'getInstalledComponents',
            'setup': 'getSetupComponents',
        }

        if option in listings:
            client = SystemAdministratorClient(self.host, self.port)
            result = getattr(client, listings[option])()
            if not result['OK']:
                print(" ERROR: %s" % (result['Message'],))
            else:
                print("")
                pprint.pprint(result['Value'])
        elif option == 'status':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getOverallStatus()
            if not result['OK']:
                print("ERROR: %s" % (result['Message'],))
            else:
                rDict = result['Value']
                print("")
                print(' '.join(["   System", ' ' * 20, 'Name', ' ' * 15, 'Type',
                                ' ' * 13,
                                'Setup    Installed   Runit    Uptime    PID']))
                print('-' * 116)
                for compType, systems in rDict.items():
                    for system, components in systems.items():
                        for component, info in components.items():
                            # Only installed components are listed, so the
                            # "Installed" column is constant for every row.
                            if not info['Installed']:
                                continue
                            setupLabel = 'SetUp' if info['Setup'] else 'NotSetup'
                            columns = [
                                system.ljust(28),
                                component.ljust(28),
                                # Drop the trailing character of the type name
                                # (e.g. a plural "s") for display.
                                compType.lower()[:-1].ljust(7),
                                setupLabel.rjust(12),
                                'Installed'.rjust(12),
                                str(info['RunitStatus']).ljust(7),
                                str(info['Timeup']).rjust(7),
                                str(info['PID']).rjust(8),
                            ]
                            print(' '.join(columns))
        elif option == 'database' or option == 'databases':
            client = SystemAdministratorClient(self.host, self.port)
            if not InstallTools.mysqlPassword:
                InstallTools.mysqlPassword = "******"
            InstallTools.getMySQLPasswords()
            result = client.getDatabases(InstallTools.mysqlRootPwd)
            if not result['OK']:
                print("ERROR: %s" % (result['Message'],))
                return
            resultSW = client.getAvailableDatabases()
            if not resultSW['OK']:
                print("ERROR: %s" % (resultSW['Message'],))
                return

            sw = resultSW['Value']
            installed = result['Value']
            print("")
            for db in sw:
                state = 'Installed' if db in installed else 'Not installed'
                print("%s : %s" % (db.rjust(25), state))
            if not sw:
                print("No database found")
        elif option == 'mysql':
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getMySQLStatus()
            if not result['OK']:
                print("ERROR: %s" % (result['Message'],))
            elif result['Value']:
                print("")
                for par, value in result['Value'].items():
                    print("%s : %s" % (par.rjust(28), value))
            else:
                print("No MySQL database found")
        elif option == "log":
            self.getLog(argss)
        elif option == "info":
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getInfo()
            if not result['OK']:
                print("ERROR: %s" % (result['Message'],))
            else:
                info = result['Value']
                print("")
                print("Setup: %s" % (info['Setup'],))
                print("DIRAC version: %s" % (info['DIRAC'],))
                if info['Extensions']:
                    for e, v in info['Extensions'].items():
                        print("%s version %s" % (e, v))
                print("")
        elif option == "errors":
            self.getErrors(argss)
        else:
            print("Unknown option: %s" % (option,))
Пример #12
0
class LemonAgent(AgentModule):
    """Agent that reports the state of the local host's components as log lines.

    On each cycle it asks the local SystemAdministrator service for the overall
    status and writes one "LEMON <criticality> <status> <component>: <text>"
    info message for every service and agent that is set up.
    """

    def initialize(self):
        """Define the label constants, read the setup name and create the client.

        :return: S_OK()
        """
        # Criticality labels.
        self.NON_CRITICAL = "NonCritical"
        self.CRITICAL = "Critical"
        # Status labels.
        self.FAILURE = "FAILURE"
        self.OK = "OK"

        self.setup = gConfig.getValue('/DIRAC/Setup', 'LHCb-Development')
        # When False, components whose criticality is NonCritical are skipped.
        # Components without an entry under Lemon/Criticalities are treated as
        # NonCritical (see _getCriticality).
        self.outputNonCritical = True
        # Recomputed at the start of every execute() cycle.
        self.monitoringEnabled = False

        self.admClient = SystemAdministratorClient('localhost')

        return S_OK()

    def execute(self):
        """Run one monitoring cycle.

        Nothing is checked when the current setup is not listed in
        Lemon/MonitoredSetups or when this host is listed in
        Lemon/HostsInMaintenance; a NonCritical/OK line is logged instead.
        A failure to obtain the overall status is logged as Critical/FAILURE.

        :return: S_OK() in every case
        """
        ops = Operations()

        monitoredSetups = ops.getValue('Lemon/MonitoredSetups',
                                       ['LHCb-Production'])
        self.monitoringEnabled = self.setup in monitoredSetups

        if not self.monitoringEnabled:
            self._log(
                "Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                "Monitoring not enabled for this setup: " + self.setup +
                ". Exiting.")
            return S_OK()

        hostsInMaintenance = ops.getValue('Lemon/HostsInMaintenance', [])
        if gethostname() in hostsInMaintenance:
            self._log("Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                      "I am in maintenance mode, exiting.")
            return S_OK()

        result = self.admClient.getOverallStatus()

        if not result or not result['OK']:
            self._log("Framework/LemonAgent", self.CRITICAL, self.FAILURE,
                      "Can not obtain result!!")
            return S_OK()

        # Look both sections up before processing either of them.
        services = result['Value']['Services']
        agents = result['Value']['Agents']
        self._processResults(services)
        self._processResults(agents)

        return S_OK()

    def _processResults(self, results):
        """Log the running state of every set-up component in ``results``.

        :param results: mapping system -> component name -> status dictionary;
                        the 'Setup' and 'RunitStatus' entries are read
        """
        for system in results:
            for part in results[system]:
                component = results[system][part]
                # Only services and agents that are set up are monitored.
                if not component['Setup']:
                    continue

                componentName = system + "/" + part
                # Looked up once and reused for both the filter and the log line.
                critLevel = self._getCriticality(componentName)
                if critLevel == self.NON_CRITICAL and not self.outputNonCritical:
                    continue

                if component['RunitStatus'] == 'Run':
                    self._log(componentName, critLevel, self.OK,
                              "Service/Agent running fine")
                else:
                    self._log(componentName, critLevel, self.FAILURE,
                              "Service/Agent failure!")

    def _getCriticality(self, component):
        """Return the criticality configured for ``component``.

        :param component: component name in the form "<system>/<name>"
        :return: value of Lemon/Criticalities/<component>, or the NonCritical
                 label when no value is configured
        """
        return Operations().getValue('Lemon/Criticalities/' + component,
                                     self.NON_CRITICAL)

    def _log(self, component, criticality, status, string):
        """Write one "LEMON ..." line at info level.

        :param component: component name the line refers to
        :param criticality: criticality label
        :param status: status label
        :param string: free-text message appended after the colon
        """
        gLogger.info("LEMON " + criticality + " " + status + " " + component +
                     ": " + string + "\n")