def __init__(self, **kwargs):
        """ Build the list of hosts to work with and the thread pool serving them.

        Keyword arguments:
          hosts   -- explicit list of hosts; it is removed from kwargs and used as is
          exclude -- hosts to drop from the list obtained from the Registry; it is
                     only looked at when 'hosts' is not given

        Hosts whose SystemAdministrator service does not answer a ping are dropped.
        The remaining keyword arguments are kept in a private copy.
        """
        if 'hosts' not in kwargs:
            # No explicit list: fall back to what the Registry returns
            registered = Registry.getHosts()
            self.__hosts = registered['Value'] if registered['OK'] else []
            if 'exclude' in kwargs:
                # Set difference, so the ordering of the remaining hosts is arbitrary
                self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
        else:
            self.__hosts = kwargs.pop('hosts')

        # Keep only the hosts that answer the SystemAdministrator ping
        self.__hosts = [candidate for candidate in self.__hosts
                        if SystemAdministratorClient(candidate).ping()['OK']]

        self.__kwargs = dict(kwargs)
        # One pool slot per host that passed the ping
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}
  def do_stop( self, args ):
    """ Stop services or agents or database server

        usage:

          stop <system|*> <service|agent|*>
          stop mysql
    """
    argss = args.split()
    if argss[0] != 'mysql':
      system = argss[0]
      if system != '*':
        component = argss[1]
      else:
        component = '*'
      client = SystemAdministratorClient( self.host, self.port )
      result = client.stopComponent( system, component )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        if system != '*' and component != '*':
          print "\n%s_%s stopped successfully, runit status:\n" % ( system, component )
        else:
          print "\nComponents stopped successfully, runit status:\n"
        for comp in result['Value']:
          print comp.rjust( 32 ), ':', result['Value'][comp]['RunitStatus']
    else:
      print "Not yet implemented"
示例#3
0
    def restartHost(hostName):
        """
        Restart all systems and components of a host

        :param str hostName: name of the host you want to restart
        :return: the failed ping result when the host cannot be reached; S_OK with
                 the "Peer closed connection" message when the connection is lost
                 after the restart request; otherwise the restart result unchanged
        """
        host, port = parseHostname(hostName)

        gLogger.notice("Pinging %s ..." % host)

        client = SystemAdministratorClient(host, port)
        result = client.ping()
        if not result['OK']:
            gLogger.error("Could not connect to %s: %s" %
                          (host, result['Message']))
            return result
        gLogger.notice("Host %s is active" % host)

        gLogger.notice("Initiating restart of all systems and components")
        # The restart call is expected to come back as an error: the
        # SystemAdministrator itself is restarted, so the connection is lost
        result = client.restartComponent('*', '*')
        if result['OK']:
            # Not the expected outcome, but not a failure either. Return before
            # reading 'Message', which is only looked up on error results below.
            gLogger.notice("Restart request on %s returned successfully" % host)
            return result

        message = result['Message']
        if message == "Peer closed connection":
            gLogger.notice(
                "Restarted all systems on %s : connection to SystemAdministrator lost"
                % host)
            return S_OK(message)
        gLogger.error("Received unxpected message: %s" % message)
        return result
  def __init__(self, **kwargs):
    """ Build the list of hosts to work with and the thread pool serving them.

        Keyword arguments:
          hosts   -- explicit list of hosts; it is removed from kwargs and used as is
          exclude -- hosts to drop from the list obtained from the Registry; it is
                     only looked at when 'hosts' is not given

        Hosts whose SystemAdministrator service does not answer a ping are dropped.
        The remaining keyword arguments are kept in a private copy.
    """
    if 'hosts' not in kwargs:
      # No explicit list: fall back to what the Registry returns
      registered = Registry.getHosts()
      self.__hosts = registered['Value'] if registered['OK'] else []
      if 'exclude' in kwargs:
        # Set difference, so the ordering of the remaining hosts is arbitrary
        self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
    else:
      self.__hosts = kwargs.pop('hosts')

    # Keep only the hosts that answer the SystemAdministrator ping
    self.__hosts = [candidate for candidate in self.__hosts
                    if SystemAdministratorClient(candidate).ping()['OK']]

    self.__kwargs = dict(kwargs)
    # One pool slot per host that passed the ping
    self.__pool = ThreadPool(len(self.__hosts))
    self.__resultDict = {}
示例#5
0
  def showHostErrors(self):
    """ Return the component log check results of the host named in the request.

        The host is read from the "host" request parameter. On a missing host or
        a failed service call the returned dictionary has "success" set to "false"
        and an "error" text; otherwise it has "success" set to "true", the list of
        records in "result" and their number in "total".
    """
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    if "host" not in request.params:
      return {"success": "false", "error": "Name of the host is missing or not defined"}
    host = str(request.params["host"])

    adminClient = SystemAdministratorClient(host, None, delegatedDN=userDN, delegatedGroup=userGroup)
    response = adminClient.checkComponentLog("*")
    gLogger.debug(response)
    if not response["OK"]:
      return {"success": "false", "error": response["Message"]}

    records = []
    for fullName, info in response["Value"].items():
      # Keys are split as "<system>/<component>"; each record is tagged in place
      system, component = fullName.split("/")
      info["System"] = system
      info["Name"] = component
      info["Host"] = host
      records.append(info)

    return {"success": "true", "result": records, "total": len(records)}
    def __init__(self, *args, **kwargs):
        """Set up the agent: default settings, service clients and bookkeeping."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"

        # Every switch starts disabled
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False

        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        # Clients; the SystemAdministrator one is bound to this machine's FQDN
        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None

        # Containers, empty at construction time
        self.agents = {}
        self.executors = {}
        self.services = {}
        self._tornadoPort = "8443"
        self.errors = []
        self.accounting = defaultdict(dict)

        # E-mail settings
        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()
示例#7
0
  def do_add( self, args ):
    """
        Add new entity to the Configuration Service

        usage:

          add system <system> <instance>
    """
    argss = args.split()
    option = argss[0]
    del argss[0]
    if option == "instance" or option == "system":
      system = argss[0]
      instance = argss[1]
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      hostSetup = result['Value']['Setup']
      instanceName = gConfig.getValue( '/DIRAC/Setups/%s/%s' % ( hostSetup, system ), '' )
      if instanceName:
        if instanceName == instance:
          print "System %s already has instance %s defined in %s Setup" % ( system, instance, hostSetup )
        else:
          self.__errMsg( "System %s already has instance %s defined in %s Setup" % ( system, instance, hostSetup ) )
        return
      result = InstallTools.addSystemInstance( system, instance, hostSetup )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print "%s system instance %s added successfully" % ( system, instance )
    else:
      print "Unknown option:", option
示例#8
0
  def showLog(self):
    """ Return the log tail of one component with newlines turned into "<br>".

        The "host", "system" and "component" request parameters are all required.
        When one of them is missing, the service call fails or the expected
        "<system>_<component>" key is absent from the answer, a plain text
        explanation is returned instead of the log.
    """
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    required = (("host", "Name of the host is missing or not defined"),
                ("system", "Name of the system is missing or not defined"),
                ("component", "Name of component is missing or not defined"))
    given = []
    for param, complaint in required:
      # Checked and read one at a time, in this order
      if param not in request.params:
        return complaint
      given.append(str(request.params[param]))
    host, system, name = given

    adminClient = SystemAdministratorClient(host, None, delegatedDN=userDN, delegatedGroup=userGroup)
    response = adminClient.getLogTail(system, name)
    gLogger.debug(response)
    if not response["OK"]:
      return response["Message"]
    tails = response["Value"]

    key = system + "_" + name
    if key not in tails:
      return "%s key is absent in service response" % key

    return tails[key].replace("\n", "<br>")
  def do_add( self, args ):
    """
        Add new entity to the Configuration Service

        usage:

          add system <system> <instance>
    """
    argss = args.split()
    option = argss[0]
    del argss[0]
    if option == "instance" or option == "system":
      system = argss[0]
      instance = argss[1]
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      hostSetup = result['Value']['Setup']
      instanceName = gConfig.getValue( '/DIRAC/Setups/%s/%s' % ( hostSetup, system ), '' )
      if instanceName:
        if instanceName == instance:
          print "System %s already has instance %s defined in %s Setup" % ( system, instance, hostSetup )
        else:
          self.__errMsg( "System %s already has instance %s defined in %s Setup" % ( system, instance, hostSetup ) )
        return
      result = InstallTools.addSystemInstance( system, instance, hostSetup )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print "%s system instance %s added successfully" % ( system, instance )
    else:
      print "Unknown option:", option
示例#10
0
  def submit(self):
    """
    Return the flattened overall status records of the components of one host.

    The host is taken from the 'Hostname' entry of self.request(). The returned
    dictionary has "success" set to "false" and an "error" text when the host is
    not given or the service call fails; otherwise "success" is "true" and
    "result" holds the records, each one tagged with its "Host".
    """
    checkUserCredentials()
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    params = self.request()
    if 'Hostname' not in params:
      return {"success": "false", "error": "Name of the host is absent"}
    host = params['Hostname']

    adminClient = SystemAdministratorClient(host, None, delegatedDN=userDN,
                                            delegatedGroup=userGroup)
    response = adminClient.getOverallStatus()
    gLogger.debug("Result of getOverallStatus(): %s" % response)
    if not response["OK"]:
      return {"success": "false", "error": response["Message"]}

    records = []
    for record in self.flatten(response["Value"]):
      # Each flattened record is tagged in place with the host it belongs to
      record["Host"] = host
      records.append(record)

    return {"success": "true", "result": records}
    def do_stop(self, args):
        """ Stop services or agents or database server

        usage:

          stop <system|*> <service|agent|*>
          stop mysql
    """
        argss = args.split()
        if argss[0] != 'mysql':
            system = argss[0]
            if system != '*':
                component = argss[1]
            else:
                component = '*'
            client = SystemAdministratorClient(self.host, self.port)
            result = client.stopComponent(system, component)
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                if system != '*' and component != '*':
                    print "\n%s_%s stopped successfully, runit status:\n" % (
                        system, component)
                else:
                    print "\nComponents stopped successfully, runit status:\n"
                for comp in result['Value']:
                    print comp.rjust(
                        32), ':', result['Value'][comp]['RunitStatus']
        else:
            print "Not yet implemented"
 def do_revert( self, args ):
   """ Revert the last installed version of software to the previous one
   
       usage:
       
           revert
   """ 
   client = SystemAdministratorClient( self.host, self.port )
   result = client.revertSoftware()
   if not result['OK']:
     print "Error:", result['Message']
   else:
     print "Software reverted to", result['Value']  
示例#13
0
    def initialize(self):
        """Set the label constants, the setup name and the admin client.

        Always returns S_OK().
        """
        # String labels stored on the instance
        self.NON_CRITICAL, self.CRITICAL = "NonCritical", "Critical"
        self.FAILURE, self.OK = "FAILURE", "OK"

        # Setup name from the configuration, with a development default
        self.setup = gConfig.getValue('/DIRAC/Setup', 'LHCb-Development')
        self.outputNonCritical = True
        # NOTE(review): the original comment here says that all components not
        # present "here" are treated as non critical; no such list is visible
        # in this method - confirm where it is meant to be defined

        # Client bound to the SystemAdministrator of the local machine
        self.admClient = SystemAdministratorClient('localhost')

        return S_OK()
 def do_revert(self, args):
     """ Revert the last installed version of software to the previous one
 
     usage:
     
         revert
 """
     client = SystemAdministratorClient(self.host, self.port)
     result = client.revertSoftware()
     if not result['OK']:
         print "Error:", result['Message']
     else:
         print "Software reverted to", result['Value']
示例#15
0
  def __init__(self, *args, **kwargs):
    """Set up the agent: default settings, service clients and bookkeeping."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"

    # Every switch starts disabled
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False

    self.diracLocation = "/opt/dirac/pro"

    # Clients; the SystemAdministrator one is bound to this machine's host name
    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # Containers, empty at construction time
    self.agents = {}
    self.executors = {}
    self.services = {}
    self.errors = []
    self.accounting = defaultdict(dict)

    # E-mail settings
    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()
 def __executeClient(self, host, method, *parms, **kwargs):
     """ Call the named method of a SystemAdministratorClient created for host.

     The client is built with the keyword arguments stored at construction
     time; parms and kwargs are handed over to the method itself. The result
     is returned with an extra 'Host' entry holding the host.
     """
     rpcCall = getattr(SystemAdministratorClient(host, **self.__kwargs), method)
     answer = rpcCall(*parms, **kwargs)
     # Tag the answer so that it can be told apart from those of other hosts
     answer['Host'] = host
     return answer
示例#17
0
 def __executeClient(self, host, method, *parms, **kwargs):
     """Call the named method of a SystemAdministratorClient created for host.

     The client is addressed through the "Host" option the Registry returns
     for the host, with the host itself as the default. parms and kwargs are
     handed over to the method; the result is returned with an extra "Host"
     entry holding the host as it was passed in.
     """
     address = Registry.getHostOption(host, "Host", host)
     rpcCall = getattr(SystemAdministratorClient(address, **self.__kwargs), method)
     answer = rpcCall(*parms, **kwargs)
     # Tagged with the original host argument, not with the resolved address
     answer["Host"] = host
     return answer
    def manageService(self, service, action):
        """ Manage services running on this machine

        usage:

          service <action> <serviceName>

        :param str service: name of the service, matched against the 'Module'
                            of the 'External' installations on self.host
        :param str action: one of 'start', 'stop', 'restart' or 'status'

        Errors are reported through self._errMsg; nothing is returned.
        """
        # Supported actions and the SystemAdministratorClient method behind each
        actionMethods = {
            'start': 'startService',
            'stop': 'stopService',
            'restart': 'restartService',
            'status': 'statusService',
        }
        if action not in actionMethods:
            # Reject unknown actions up front: they used to fall through and the
            # installation records were logged as if they were the outcome
            self._errMsg('Unknown action: %s' % action)
            return

        client = ComponentMonitoringClient()
        result = client.getInstallations({'UninstallationTime': None}, {
            'System': 'External',
            'Module': service,
            'Type': 'External'
        }, {'HostName': self.host}, False)
        if not result['OK']:
            self._errMsg(result['Message'])
            return
        if len(result['Value']) < 1:
            self._errMsg('%s is not installed' % (service))
            return

        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, actionMethods[action])(service)
        if not result['OK']:
            self._errMsg(result['Message'])
            return

        gLogger.notice(result['Value'])
示例#19
0
  def __actionHost(self):

    """
    Restart all DIRAC components, or revert the software, on the given hosts

    The hosts come as a comma separated list in the "hostname" request parameter
    and the operation is selected by self.action ("restart" or "revert"). The
    outcome per host is collected in self.actionSuccess and self.actionFailed
    and the result of self.__aftermath() is returned.
    """

    if "hostname" not in request.params:
      return {"success": "false", "error": "No hostname given"}
    hosts = request.params["hostname"].split(",")

    DN = getUserDN()
    group = getSelectedGroup()

    self.actionSuccess = list()
    self.actionFailed = list()

    for i in hosts:
      client = SystemAdministratorClient(str(i), None, delegatedDN=DN,
                                         delegatedGroup=group)
      # Compare by value: whether two equal strings are the same object is an
      # implementation detail, so "is" could send a valid action to the else branch
      if self.action == "restart":
        result = client.restartComponent(str("*"), str("*"))
      elif self.action == "revert":
        result = client.revertSoftware()
      else:
        error = i + ": Action %s is not defined" % self.action
        self.actionFailed.append(error)
        continue

      gLogger.always(result)

      if not result["OK"]:
        # Membership test, so that a message starting with the marker is matched
        # too (find() returns 0 in that case, which "> 0" used to reject)
        if "Unexpected EOF" in result["Message"]:
          msg = "Signal 'Unexpected EOF' received. Most likely DIRAC components"
          msg = i + ": " + msg + " were successfully restarted."
          self.actionSuccess.append(msg)
          continue
        error = i + ": " + result["Message"]
        self.actionFailed.append(error)
        gLogger.error(error)
      else:
        gLogger.info(result["Value"])
        self.actionSuccess.append(i)

    self.prefix = "Host"
    return self.__aftermath()
示例#20
0
    def web_getHostLog(self):
        """Send back the log tail of one component of a host.

        The "host", "system" and "component" request arguments are all
        required; the first value of each is used. The answer is always given
        through self.finish(): {"success": "false", "error": ...} when an
        argument is missing, the service call fails or the
        "<system>_<component>" key is absent from the answer, otherwise
        {"success": "true", "result": ...} with the newlines of the log
        replaced by "<br>".

        This is a generator: it yields the task returned by self.threadTask().
        """
        userData = self.getSessionData()
        userDN = str(userData["user"]["DN"])
        userGroup = str(userData["user"]["group"])

        required = (
            ("host", "Name of the host is missing or not defined"),
            ("system", "Name of the system is missing or not defined"),
            ("component", "Name of component is missing or not defined"),
        )
        given = []
        for param, complaint in required:
            # Checked and read one at a time, in this order
            if param not in self.request.arguments:
                self.finish({"success": "false", "error": complaint})
                return
            given.append(str(self.request.arguments[param][0]))
        host, system, name = given

        adminClient = SystemAdministratorClient(host,
                                                None,
                                                delegatedDN=userDN,
                                                delegatedGroup=userGroup)

        response = yield self.threadTask(adminClient.getLogTail, system, name)
        gLogger.debug(response)

        if not response["OK"]:
            self.finish({"success": "false", "error": response["Message"]})
            return
        tails = response["Value"]

        key = system + "_" + name
        if key not in tails:
            self.finish({
                "success": "false",
                "error": "%s key is absent in service response" % key
            })
            return

        self.finish({
            "success": "true",
            "result": tails[key].replace("\n", "<br>")
        })
示例#21
0
  def do_exec( self, args ):
    """ Execute a shell command on the remote host and get back the output

        usage:

          exec <cmd> [<arguments>]
    """
    client = SystemAdministratorClient( self.host, self.port )
    result = client.executeCommand( args )
    if not result['OK']:
      self.__errMsg( result['Message'] )
    status, output, error = result['Value']
    print
    for line in output.split( '\n' ):
      print line
    if error:
      self.__errMsg( status )
      for line in error.split( '\n' ):
        print line
 def do_update(self, args):
     """ Update the software on the target host to a given version
 
     usage:
       
       update <version> 
 """
     argss = args.split()
     version = argss[0]
     client = SystemAdministratorClient(self.host, self.port)
     print "Software update can take a while, please wait ..."
     result = client.updateSoftware(version)
     if not result['OK']:
         self.__errMsg("Failed to update the software")
         print result['Message']
     else:
         print "Software successfully updated."
         print "You should restart the services to use the new software version."
         print "Think of updating /Operation/<vo>/<setup>/Versions section in the CS"
 def do_exec(self, args):
     """ Execute a shell command on the remote host and get back the output
 
     usage:
     
       exec <cmd> [<arguments>]
 """
     client = SystemAdministratorClient(self.host, self.port)
     result = client.executeCommand(args)
     if not result['OK']:
         self.__errMsg(result['Message'])
     status, output, error = result['Value']
     print
     for line in output.split('\n'):
         print line
     if error:
         self.__errMsg(status)
         for line in error.split('\n'):
             print line
示例#24
0
  def do_update( self, args ):
    """ Update the software on the target host to a given version

        usage:

          update <version>
    """
    argss = args.split()
    version = argss[0]
    client = SystemAdministratorClient( self.host, self.port )
    print "Software update can take a while, please wait ..."
    result = client.updateSoftware( version )
    if not result['OK']:
      self.__errMsg( "Failed to update the software" )
      print result['Message']
    else:
      print "Software successfully updated."
      print "You should restart the services to use the new software version."
      print "Think of updating /Operation/<vo>/<setup>/Versions section in the CS"
    def do_cd(self, args):
        """ Change the current working directory on the target host

        Usage:
          cd <dirpath>
        """
        argss = args.split()

        if argss:
            target = argss[0]
            if target == '-':
                # Swap the current and the previous directories, if there is one
                if self.previous_cwd:
                    self.cwd, self.previous_cwd = self.previous_cwd, self.cwd
            else:
                # Absolute paths are taken as given, relative ones are
                # resolved against the current directory and normalized
                if not target.startswith('/'):
                    target = os.path.normpath(self.cwd + '/' + target)
                self.previous_cwd = self.cwd
                self.cwd = target
        elif self.homeDir:
            # No argument: go to the home directory already known
            self.previous_cwd = self.cwd
            self.cwd = self.homeDir
        else:
            # No argument and no home directory yet: ask the remote host
            client = SystemAdministratorClient(self.host, self.port)
            result = client.executeCommand('echo $HOME')
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            status, output, _error = result['Value']
            if not status and output:
                self.homeDir = output.strip()
                self.previous_cwd = self.cwd
                self.cwd = self.homeDir

        # The prompt always reflects the directory in effect
        self.prompt = '[%s:%s]> ' % (self.host, self.cwd)
 def do_cd(self, args):
   """ Change the current working directory on the target host

       Usage:
         cd <dirpath>
   """
   argss = args.split()

   if argss:
     target = argss[0]
     if target == '-':
       # Swap the current and the previous directories, if there is one
       if self.previous_cwd:
         self.cwd, self.previous_cwd = self.previous_cwd, self.cwd
     else:
       # Absolute paths are taken as given, relative ones are
       # resolved against the current directory and normalized
       if not target.startswith('/'):
         target = os.path.normpath(self.cwd + '/' + target)
       self.previous_cwd = self.cwd
       self.cwd = target
   elif self.homeDir:
     # No argument: go to the home directory already known
     self.previous_cwd = self.cwd
     self.cwd = self.homeDir
   else:
     # No argument and no home directory yet: ask the remote host
     client = SystemAdministratorClient(self.host, self.port)
     result = client.executeCommand('echo $HOME')
     if not result['OK']:
       self.__errMsg(result['Message'])
       return
     status, output, _error = result['Value']
     if not status and output:
       self.homeDir = output.strip()
       self.previous_cwd = self.cwd
       self.cwd = self.homeDir

   # The prompt always reflects the directory in effect
   self.prompt = '[%s:%s]> ' % (self.host, self.cwd)
    def getErrors(self, argss):
        """ Get and print out errors from the logs of specified components
    """
        component = ''
        if len(argss) < 1:
            component = '*'
        else:
            system = argss[0]
            if system == "*":
                component = '*'
            else:
                if len(argss) < 2:
                    print
                    print self.do_show.__doc__
                    return
                comp = argss[1]
                component = '/'.join([system, comp])

        client = SystemAdministratorClient(self.host, self.port)
        result = client.checkComponentLog(component)
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            fields = [
                'System', 'Component', 'Last hour', 'Last day', 'Last error'
            ]
            records = []
            for cname in result['Value']:
                system, component = cname.split('/')
                errors_1 = result['Value'][cname]['ErrorsHour']
                errors_24 = result['Value'][cname]['ErrorsDay']
                lastError = result['Value'][cname]['LastError']
                lastError.strip()
                if len(lastError) > 80:
                    lastError = lastError[:80] + '...'
                records.append([
                    system, component,
                    str(errors_1),
                    str(errors_24), lastError
                ])
            records.sort()
            printTable(fields, records)
  def do_uninstall( self, args ):
    """
        Uninstall DIRAC component

        usage:

          uninstall <system> <component>
    """
    argss = args.split()
    if not argss or len(argss) != 2:
      print self.do_uninstall.__doc__
      return
    
    system,component = argss
    client = SystemAdministratorClient( self.host, self.port )
    result = client.uninstallComponent( system, component )
    if not result['OK']:
      print "Error:", result['Message']
    else:
      print "Successfully uninstalled %s/%s" % (system,component)  
    def do_uninstall(self, args):
        """
        Uninstall DIRAC component

        usage:

          uninstall <system> <component>
    """
        argss = args.split()
        if not argss or len(argss) != 2:
            print self.do_uninstall.__doc__
            return

        system, component = argss
        client = SystemAdministratorClient(self.host, self.port)
        result = client.uninstallComponent(system, component)
        if not result['OK']:
            print "Error:", result['Message']
        else:
            print "Successfully uninstalled %s/%s" % (system, component)
  def do_update(self, args):
    """ Update the software on the target host to a given version

        usage:

          update <version> [ -r <rootPath> ] [ -g <lcgVersion> ]

              where rootPath - path to the DIRAC installation
                    lcgVersion - version of the LCG bindings to install
    """
    rootPath = ''
    lcgVersion = ''
    try:
      argss = args.split()
      # pop() raises IndexError when the version or an option value is missing
      version = argss.pop(0)
      while argss:
        flag = argss.pop(0)
        if flag == '-r':
          rootPath = argss.pop(0)
        elif flag == '-g':
          lcgVersion = argss.pop(0)
        else:
          # An unknown token used to be left in the list, so that the loop
          # never terminated; reject it explicitly instead
          raise ValueError("unknown option %s" % flag)
    except (IndexError, ValueError) as x:
      gLogger.notice("ERROR: wrong input:", str(x))
      gLogger.notice(self.do_update.__doc__)
      return

    client = SystemAdministratorClient(self.host, self.port)
    gLogger.notice("Software update can take a while, please wait ...")
    result = client.updateSoftware(version, rootPath, lcgVersion, timeout=300)
    if not result['OK']:
      self._errMsg("Failed to update the software")
      gLogger.notice(result['Message'])
    else:
      gLogger.notice("Software successfully updated.")
      gLogger.notice("You should restart the services to use the new software version.")
      gLogger.notice("Think of updating /Operations/<vo>/<setup>/Pilot/Versions section in the CS")
  def do_update(self, args):
    """ Update the software on the target host to a given version

        usage:

          update <version> [ -r <rootPath> ] [ -g <lcgVersion> ]

              where rootPath - path to the DIRAC installation
                    lcgVersion - version of the LCG bindings to install
    """
    rootPath = ''
    lcgVersion = ''
    try:
      argss = args.split()
      # pop() raises IndexError when the version or an option value is missing
      version = argss.pop(0)
      while argss:
        flag = argss.pop(0)
        if flag == '-r':
          rootPath = argss.pop(0)
        elif flag == '-g':
          lcgVersion = argss.pop(0)
        else:
          # An unknown token used to be left in the list, so that the loop
          # never terminated; reject it explicitly instead
          raise ValueError("unknown option %s" % flag)
    except (IndexError, ValueError) as x:
      gLogger.notice("ERROR: wrong input:", str(x))
      gLogger.notice(self.do_update.__doc__)
      return

    client = SystemAdministratorClient(self.host, self.port)
    gLogger.notice("Software update can take a while, please wait ...")
    result = client.updateSoftware(version, rootPath, lcgVersion, timeout=300)
    if not result['OK']:
      self._errMsg("Failed to update the software")
      gLogger.notice(result['Message'])
    else:
      gLogger.notice("Software successfully updated.")
      gLogger.notice("You should restart the services to use the new software version.")
      gLogger.notice("Think of updating /Operations/<vo>/<setup>/Pilot/Versions section in the CS")
示例#32
0
  def do_restart(self, args):
    """ Restart services or agents or database server

        usage:

          restart <system|*> <service|agent|*>
          restart mysql
    """
    argss = args.split()
    if not argss:
      # Covers empty and whitespace-only input alike
      gLogger.notice(self.do_restart.__doc__)
      return

    system = argss[0]
    if system == 'mysql':
      gLogger.notice("Not yet implemented")
      return

    if system == '*':
      # A wildcard system implies all the components
      component = '*'
    elif len(argss) < 2:
      gLogger.notice(self.do_restart.__doc__)
      return
    else:
      component = argss[1]

    client = SystemAdministratorClient(self.host, self.port)
    result = client.restartComponent(system, component)
    if not result['OK']:
      if system == '*':
        # An error on a full restart is taken as the connection being lost
        gLogger.notice("All systems are restarted, connection to SystemAdministrator is lost")
      else:
        self.__errMsg(result['Message'])
      return

    if system != '*' and component != '*':
      gLogger.notice("\n%s_%s started successfully, runit status:\n" % (system, component))
    else:
      gLogger.notice("\nComponents started successfully, runit status:\n")
    for comp in result['Value']:
      # Log one formatted line per component; a tuple used to be passed here
      gLogger.notice("%s : %s" % (comp.rjust(32), result['Value'][comp]['RunitStatus']))
  def do_restart(self, args):
    """ Restart services or agents or database server

        usage:

          restart <system|*> <service|agent|*>
          restart mysql
    """
    argss = args.split()
    if not argss:
      # Covers empty and whitespace-only input alike
      gLogger.notice(self.do_restart.__doc__)
      return

    system = argss[0]
    if system == 'mysql':
      gLogger.notice("Not yet implemented")
      return

    if system == '*':
      # A wildcard system implies all the components
      component = '*'
    elif len(argss) < 2:
      gLogger.notice(self.do_restart.__doc__)
      return
    else:
      component = argss[1]

    client = SystemAdministratorClient(self.host, self.port)
    result = client.restartComponent(system, component)
    if not result['OK']:
      if system == '*':
        # An error on a full restart is taken as the connection being lost
        gLogger.notice("All systems are restarted, connection to SystemAdministrator is lost")
      else:
        self.__errMsg(result['Message'])
      return

    if system != '*' and component != '*':
      gLogger.notice("\n%s_%s started successfully, runit status:\n" % (system, component))
    else:
      gLogger.notice("\nComponents started successfully, runit status:\n")
    for comp in result['Value']:
      # Log one formatted line per component; a tuple used to be passed here
      gLogger.notice("%s : %s" % (comp.rjust(32), result['Value'][comp]['RunitStatus']))
  def do_exec( self, args ):
    """ Execute a shell command on the remote host and get back the output

        usage:

          exec <cmd> [<arguments>]
    """
    client = SystemAdministratorClient( self.host, self.port )
    # The command is prefixed with a "cd" into the directory kept in self.cwd
    changeDir = 'cd %s;' % self.cwd
    result = client.executeCommand( changeDir + args )
    if result['OK']:
      status, output, error = result['Value']
      gLogger.notice( '' )
      for outLine in output.split( '\n' ):
        gLogger.notice( outLine )
      if error:
        # Report the returned status first, then the error text line by line
        self.__errMsg( status )
        for errLine in error.split( '\n' ):
          gLogger.notice( errLine )
    else:
      self.__errMsg( result['Message'] )
示例#35
0
  def do_exec( self, args ):
    """ Execute a shell command on the remote host and get back the output

        usage:

          exec <cmd> [<arguments>]
    """
    adminClient = SystemAdministratorClient( self.host, self.port )
    # Run the user's command after changing into the directory kept in self.cwd
    remoteCommand = ( 'cd %s;' % self.cwd ) + args
    reply = adminClient.executeCommand( remoteCommand )
    if not reply['OK']:
      self.__errMsg( reply['Message'] )
      return

    show = gLogger.notice
    status, output, error = reply['Value']
    show( '' )
    for text in output.split( '\n' ):
      show( text )
    if not error:
      return
    # Error output present: show the returned status, then the error lines
    self.__errMsg( status )
    for text in error.split( '\n' ):
      show( text )
示例#36
0
  def initialize( self ):
    """ Define the label constants and defaults used by this object and create
        a SystemAdministrator client for 'localhost'.

        :return: S_OK()
    """
    # Label constants
    self.OK = "OK"
    self.FAILURE = "FAILURE"
    self.CRITICAL = "Critical"
    self.NON_CRITICAL = "NonCritical"

    self.outputNonCritical = True
    #all components not present here will be treated as non critical

    # Setup name from the configuration, with a development default
    self.setup = gConfig.getValue( '/DIRAC/Setup', 'LHCb-Development' )

    self.admClient = SystemAdministratorClient( 'localhost' )

    return S_OK()
    def do_start(self, args):
        """ Start services or agents or database server

        usage:

          start <system|*> <service|agent|*>
          start mysql
    """
        # The docstring above doubles as the usage text printed on bad input.
        argss = args.split()
        if not argss:
            print(self.do_start.__doc__)
            return

        system = argss[0]
        if system == 'mysql':
            # Single-word form from the usage text; must not be rejected
            # by the argument-count check
            print("Not yet implemented")
            return

        if system == '*':
            # Starting every system implies every component
            component = '*'
        elif len(argss) < 2:
            print(self.do_start.__doc__)
            return
        else:
            component = argss[1]

        client = SystemAdministratorClient(self.host, self.port)
        result = client.startComponent(system, component)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return

        if system != '*' and component != '*':
            print("\n%s_%s started successfully, runit status:\n" % (
                system, component))
        else:
            print("\nComponents started successfully, runit status:\n")
        for comp in result['Value']:
            print("%s : %s" % (comp.rjust(32),
                               result['Value'][comp]['RunitStatus']))
示例#38
0
    def updateHost(hostName, version):
        """
    Update the software of one host through its SystemAdministrator service

    :param str hostName: name of the host you want to update
    :param str version: version vArBpC you want to update to
    :return: S_OK() on success, otherwise the failed result of the ping or of
             the update call
    """
        host, port = parseHostname(hostName)
        adminClient = SystemAdministratorClient(host, port)

        # Check that the service answers before starting the long update
        pingResult = adminClient.ping()
        if not pingResult['OK']:
            gLogger.error("Cannot connect to %s" % host)
            return pingResult

        gLogger.notice(
            "Initiating software update of %s, this can take a while, please be patient ..."
            % host)
        updateResult = adminClient.updateSoftware(version, '', '', timeout=600)
        if updateResult['OK']:
            return S_OK()
        return updateResult
  def getLog( self, argss ):
    """ Get the tail of the log file of the given component

        :param list argss: [ system, component ], optionally followed by the
                           number of lines to show ( 40 when omitted )
    """
    if len( argss ) < 2:
      print( "" )
      print( self.do_show.__doc__ )
      return

    system = argss[0]
    component = argss[1]
    nLines = 40
    if len( argss ) > 2:
      try:
        nLines = int( argss[2] )
      except ValueError:
        # A non-numeric line count is a usage error, not a reason to let the
        # exception escape from the command
        print( "" )
        print( self.do_show.__doc__ )
        return

    client = SystemAdministratorClient( self.host, self.port )
    result = client.getLogTail( system, component, nLines )
    if not result['OK']:
      self.__errMsg( result['Message'] )
    elif result['Value']:
      # The reply is indexed by "<system>_<component>"
      logKey = '_'.join( [system, component] )
      for line in result['Value'][logKey].split( '\n' ):
        print( '    ' + line )
    else:
      print( "No logs found" )
示例#40
0
  def getLog( self, argss ):
    """ Get the tail of the log file of the given component

        :param list argss: [ system, component ], optionally followed by the
                           number of lines to show ( 40 when omitted )
    """
    if len( argss ) < 2:
      print( "" )
      print( self.do_show.__doc__ )
      return

    system, component = argss[0], argss[1]
    nLines = 40
    if len( argss ) > 2:
      try:
        nLines = int( argss[2] )
      except ValueError:
        # Treat a non-numeric line count as a usage error instead of letting
        # the exception escape from the command
        print( "" )
        print( self.do_show.__doc__ )
        return

    client = SystemAdministratorClient( self.host, self.port )
    result = client.getLogTail( system, component, nLines )
    if not result['OK']:
      self.__errMsg( result['Message'] )
      return
    if not result['Value']:
      print( "No logs found" )
      return

    # The reply is indexed by "<system>_<component>"
    logText = result['Value']['_'.join( [system, component] )]
    for line in logText.split( '\n' ):
      print( '    ' + line )
  def do_start( self, args ):
    """ Start services or agents or database server

        usage:

          start <system|*> <service|agent|*>
          start mysql
    """
    # The docstring above doubles as the usage text printed on bad input.
    argss = args.split()
    if not argss:
      print( self.do_start.__doc__ )
      return

    system = argss[0]
    if system == 'mysql':
      # Single-word form from the usage text; must not be rejected by the
      # argument-count check
      print( "Not yet implemented" )
      return

    if system == '*':
      # Starting every system implies every component
      component = '*'
    elif len( argss ) < 2:
      print( self.do_start.__doc__ )
      return
    else:
      component = argss[1]

    client = SystemAdministratorClient( self.host, self.port )
    result = client.startComponent( system, component )
    if not result['OK']:
      self.__errMsg( result['Message'] )
      return

    if system != '*' and component != '*':
      print( "\n%s_%s started successfully, runit status:\n" % ( system, component ) )
    else:
      print( "\nComponents started successfully, runit status:\n" )
    for comp in result['Value']:
      print( "%s : %s" % ( comp.rjust( 32 ), result['Value'][comp]['RunitStatus'] ) )
  def getErrors( self, argss ):
    """ Get and print out errors from the logs of specified components

        :param list argss: empty or [ '*' ] to query all the components,
                           otherwise [ system, component ]
    """
    if not argss or argss[0] == '*':
      component = '*'
    elif len( argss ) < 2:
      # A system was given without its component
      print( "" )
      print( self.do_show.__doc__ )
      return
    else:
      component = '/'.join( argss[:2] )

    client = SystemAdministratorClient( self.host, self.port )
    result = client.checkComponentLog( component )
    if not result['OK']:
      self.__errMsg( result['Message'] )
      return

    fields = ['System', 'Component', 'Last hour', 'Last day', 'Last error']
    records = []
    for cname in result['Value']:
      errorInfo = result['Value'][cname]
      system, compName = cname.split( '/' )
      # strip() returns a new string, so its result has to be kept
      lastError = errorInfo['LastError'].strip()
      if len( lastError ) > 80:
        # Keep the table readable by truncating long messages
        lastError = lastError[:80] + '...'
      records.append( [system, compName,
                       str( errorInfo['ErrorsHour'] ),
                       str( errorInfo['ErrorsDay'] ),
                       lastError] )
    records.sort()
    printTable( fields, records )
示例#43
0
    def web_getHostData(self):
        """
    Reply with the flattened list of components reported by getOverallStatus()
    for the host named in the "hostname" request argument.

    Finishes the request with {"success": "true", "result": [...]}, or with
    {"success": "false", "error": ...} when the host name is missing or the
    status call fails.
    """

        # checkUserCredentials()
        userData = self.getSessionData()

        DN = str(userData["user"]["DN"])
        group = str(userData["user"]["group"])

        # "in" replaces dict.has_key (gone in Python 3); an empty value list
        # is treated like a missing argument instead of raising IndexError
        arguments = self.request.arguments
        hostValues = arguments["hostname"] if "hostname" in arguments else None
        if not (hostValues and hostValues[0]):
            self.finish({
                "success": "false",
                "error": "Name of the host is absent"
            })
            return

        host = hostValues[0]
        client = SystemAdministratorClient(host,
                                           None,
                                           delegatedDN=DN,
                                           delegatedGroup=group)
        result = yield self.threadTask(client.getOverallStatus)
        gLogger.debug("Result of getOverallStatus(): %s" % result)

        if not result["OK"]:
            self.finish({"success": "false", "error": result["Message"]})
            return

        callback = []
        for record in self.flatten(result["Value"]):
            # Tag every flattened record with the host it came from
            record["Host"] = host
            callback.append(record)

        self.finish({"success": "true", "result": callback})
示例#44
0
    def web_getHostErrors(self):
        """
    Reply with the per-component information returned by
    checkComponentLog("*") for the host named in the "host" request argument.

    Each entry is extended with "System", "Name" and "Host" keys; the reply
    also carries the number of entries as "total".
    """
        user = self.getSessionData()["user"]
        DN = str(user["DN"])
        group = str(user["group"])

        if "host" not in self.request.arguments:
            self.finish({
                "success": "false",
                "error": "Name of the host is missing or not defined"
            })
            return

        host = str(self.request.arguments["host"][0])
        client = SystemAdministratorClient(host,
                                           None,
                                           delegatedDN=DN,
                                           delegatedGroup=group)

        result = yield self.threadTask(client.checkComponentLog, "*")
        gLogger.debug(result)
        if not result["OK"]:
            self.finish({"success": "false", "error": result["Message"]})
            return

        callback = []
        for key, value in result["Value"].items():
            # Keys have the form "<system>/<component>"
            system, component = key.split("/")
            value["System"] = system
            value["Name"] = component
            value["Host"] = host
            callback.append(value)

        self.finish({
            "success": "true",
            "result": callback,
            "total": len(callback)
        })
 def __getClient(self):
     """Build a SystemAdministratorClient for this object's host and port."""
     host, port = self.host, self.port
     return SystemAdministratorClient(host, port)
  def do_uninstall( self, args ):
    """
        Uninstall DIRAC component

        usage:

          uninstall db <database>
          uninstall <-f ForceLogUninstall> <system> <component>
    """
    # The docstring above doubles as the usage text printed on bad input.
    argss = args.split()
    if not argss:
      gLogger.notice( self.do_uninstall.__doc__ )
      return

    # Proxy information must be retrievable before anything is uninstalled
    result = getProxyInfo()
    if not result[ 'OK' ]:
      self.__errMsg( result[ 'Message'] )
      return

    option = argss[0]
    if option == 'db':
      if len( argss ) < 2:
        # "uninstall db" without a database name
        gLogger.notice( self.do_uninstall.__doc__ )
        return
      component = argss[1]
      client = SystemAdministratorClient( self.host, self.port )

      result = client.getHostInfo()
      if not result[ 'OK' ]:
        self.__errMsg( result[ 'Message' ] )
        return
      cpu = result[ 'Value' ][ 'CPUModel' ]
      hostname = self.host

      result = client.getAvailableDatabases()
      if not result[ 'OK' ]:
        self.__errMsg( result[ 'Message' ] )
        return
      if component not in result[ 'Value' ]:
        self.__errMsg( "Unknown database %s" % component )
        return
      system = result[ 'Value' ][ component ][ 'System' ]

      # The uninstallation is recorded before the database is removed
      result = MonitoringUtilities.monitorUninstallation( system , component, hostname = hostname, cpu = cpu )
      if not result[ 'OK' ]:
        self.__errMsg( result[ 'Message' ] )
        return

      result = client.uninstallDatabase( component )
      if not result[ 'OK' ]:
        self.__errMsg( result[ 'Message' ] )
      else:
        gLogger.notice( "Successfully uninstalled %s" % ( component ) )
      return

    # Component uninstallation: optional -f flag, then <system> <component>
    force = option == '-f'
    if force:
      del argss[0]

    if len( argss ) != 2:
      gLogger.notice( self.do_uninstall.__doc__ )
      return

    system, component = argss
    client = SystemAdministratorClient( self.host, self.port )

    # Exactly one matching installation without an uninstallation time must
    # be known to the monitoring
    monitoringClient = ComponentMonitoringClient()
    result = monitoringClient.getInstallations( { 'Instance': component, 'UnInstallationTime': None },
                                                { 'System': system },
                                                { 'HostName': self.host }, True )
    if not result[ 'OK' ]:
      self.__errMsg( result[ 'Message' ] )
      return
    if len( result[ 'Value' ] ) < 1:
      self.__errMsg( "Given component does not exist" )
      return
    if len( result[ 'Value' ] ) > 1:
      self.__errMsg( "Too many components match" )
      return

    removeLogs = False
    if force:
      removeLogs = True
    elif result[ 'Value' ][0][ 'Component' ][ 'Type' ] in self.runitComponents:
      # Only these component types trigger the question about their logs
      result = promptUser( 'Remove logs?', ['y', 'n'], 'n' )
      if result[ 'OK' ]:
        removeLogs = result[ 'Value' ] == 'y'

    result = client.uninstallComponent( system, component, removeLogs )
    if not result[ 'OK' ]:
      self.__errMsg( result[ 'Message' ] )
    else:
      gLogger.notice( "Successfully uninstalled %s/%s" % ( system, component ) )

    # NOTE(review): as in the original flow, the uninstallation is recorded
    # even when uninstallComponent reported a failure -- confirm this is wanted
    result = client.getHostInfo()
    if not result[ 'OK' ]:
      self.__errMsg( result[ 'Message' ] )
      return
    cpu = result[ 'Value' ][ 'CPUModel' ]
    hostname = self.host
    result = MonitoringUtilities.monitorUninstallation( system, component, hostname = hostname, cpu = cpu )
    if not result[ 'OK' ]:
      # Report the failure instead of returning the result: assuming this is a
      # cmd.Cmd console, a true return value from a do_* handler would end the
      # command loop
      self.__errMsg( result[ 'Message' ] )
  def do_show( self, args ):
    """ 
        Show list of components with various related information
        
        usage:
    
          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show errors [*|<system> <service|agent>] 
                             - show error count for the given component or all the components
                               in the last hour and day
    """

    tokens = args.split()
    if not tokens:
      print( self.do_show.__doc__ )
      return

    # First word selects the report, the rest is passed on to the sub-handlers
    option, argss = tokens[0], tokens[1:]

    # Reports whose raw reply is simply pretty-printed: option -> client method
    rawQueries = { 'software': 'getSoftwareComponents',
                   'installed': 'getInstalledComponents',
                   'setup': 'getSetupComponents' }

    if option in rawQueries:
      client = SystemAdministratorClient( self.host, self.port )
      result = getattr( client, rawQueries[option] )()
      if result['OK']:
        print( "" )
        pprint.pprint( result['Value'] )
      else:
        print( " ERROR: %s" % ( result['Message'], ) )

    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        print( "ERROR: %s" % ( result['Message'], ) )
      else:
        rDict = result['Value']
        print( "" )
        print( ' '.join( [ "   System", ' ' * 20, 'Name', ' ' * 15, 'Type', ' ' * 13,
                           'Setup    Installed   Runit    Uptime    PID' ] ) )
        print( '-' * 116 )
        # Nesting of the reply: component type -> system -> component -> info
        for compType in rDict:
          for system in rDict[compType]:
            for component in rDict[compType][system]:
              info = rDict[compType][system][component]
              if not info['Installed']:
                continue
              setupLabel = 'SetUp' if info['Setup'] else 'NotSetup'
              installedLabel = 'Installed' if info['Installed'] else 'NotInstalled'
              columns = [ system.ljust( 28 ),
                          component.ljust( 28 ),
                          compType.lower()[:-1].ljust( 7 ),
                          setupLabel.rjust( 12 ),
                          installedLabel.rjust( 12 ),
                          str( info['RunitStatus'] ).ljust( 7 ),
                          str( info['Timeup'] ).rjust( 7 ),
                          str( info['PID'] ).rjust( 8 ) ]
              print( ' '.join( columns ) )

    elif option in ( 'database', 'databases' ):
      client = SystemAdministratorClient( self.host, self.port )
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        print( "ERROR: %s" % ( result['Message'], ) )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        print( "ERROR: %s" % ( resultSW['Message'], ) )
        return

      # Compare the available databases with the installed ones
      sw = resultSW['Value']
      installed = result['Value']
      print( "" )
      for db in sw:
        state = ': Installed' if db in installed else ': Not installed'
        print( ' '.join( [ db.rjust( 25 ), state ] ) )
      if not sw:
        print( "No database found" )

    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        print( "ERROR: %s" % ( result['Message'], ) )
      elif result['Value']:
        print( "" )
        for par, value in result['Value'].items():
          print( "%s : %s" % ( par.rjust( 28 ), value ) )
      else:
        print( "No MySQL database found" )

    elif option == "log":
      self.getLog( argss )

    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        print( "ERROR: %s" % ( result['Message'], ) )
      else:
        infoDict = result['Value']
        print( "" )
        print( "Setup: %s" % ( infoDict['Setup'], ) )
        print( "DIRAC version: %s" % ( infoDict['DIRAC'], ) )
        if infoDict['Extensions']:
          for e, v in infoDict['Extensions'].items():
            print( "%s %s" % ( "%s version" % e, v ) )
        print( "" )

    elif option == "errors":
      self.getErrors( argss )

    else:
      print( "Unknown option: %s" % ( option, ) )
示例#48
0
class MonitorAgents(AgentModule):
  """MonitorAgents class."""

  def __init__(self, *args, **kwargs):
    """Set default option values, create the clients and the per-cycle bookkeeping."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'

    # Defaults; beginExecution refreshes these at the start of every cycle
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    # Clients; the system administrator client targets this machine's hostname
    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # Instances found at the start of a cycle, and what happened during it
    self.agents = {}
    self.executors = {}
    self.services = {}
    self.errors = []
    self.accounting = defaultdict(dict)

    # Notification email settings
    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()

  def logError(self, errStr, varMsg=''):
    """Log an error and keep it in self.errors for the email notification."""
    self.log.error(errStr, varMsg)
    # Stored as "<errStr> <varMsg>"; the separating space is kept even when varMsg is empty
    combined = " ".join([errStr, varMsg])
    self.errors.append(combined)

  def beginExecution(self):
    """Refresh the options and the lists of running instances before every cycle.

    :returns: S_OK, or S_ERROR as soon as one kind of instance cannot be retrieved
    """
    self.setup = self.am_getOption("Setup", self.setup)
    self.enabled = self.am_getOption("EnableFlag", self.enabled)
    self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
    self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
    self.restartServices = self.am_getOption("RestartServices", self.restartServices)
    # The installation path comes from the environment, not from the agent options
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)
    self.addressTo = self.am_getOption('MailTo', self.addressTo)
    self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
    self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents)
    self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs)

    self.csAPI = CSAPI()

    # (instance type, attribute receiving the result, message on failure)
    lookups = (('Agents', 'agents', "Failure to get running agents"),
               ('Executors', 'executors', "Failure to get running executors"),
               ('Services', 'services', "Failure to get running services"))
    for instanceType, attribute, failureMessage in lookups:
      res = self.getRunningInstances(instanceType=instanceType)
      if not res["OK"]:
        return S_ERROR(failureMessage)
      setattr(self, attribute, res["Value"])

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle.

    Nothing is sent when there are neither errors nor accounting entries.
    The collected errors and the accounting are reset afterwards.

    :returns: S_OK
    """
    if not(self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    # One table row per treated instance; every cell is wrapped in a list
    rows = [[[instanceName],
             [val.get('Treatment', 'No Treatment')],
             [str(val.get('LogAge', 'Not Relevant'))]]
            for instanceName, val in self.accounting.items()]

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      # Newline after the heading so the first error starts on its own line
      emailBody += "\n\nErrors:\n"
      emailBody += "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        # A failure for one recipient must not prevent the other mails
        self.log.error("Failure to send Email notification to ", address)

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of running agents, executors or services.

    Key is the instance name, value is a dict with PollingTime, Port, LogFileLocation, PID, Module,
    RunitStatus and System. Only instances flagged both Setup and Installed are considered.

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: S_OK with the dictionary of instances, or the failed getOverallStatus result
    """
    res = self.sysAdminClient.getOverallStatus()
    if not res["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
      return res

    anyStatus = runitStatus == 'All'
    instances = defaultdict(dict)
    for system, components in res['Value'][instanceType].items():
      for name, info in components.items():
        if not (info['Setup'] and info['Installed']):
          continue
        if not anyStatus and info['RunitStatus'] != runitStatus:
          continue

        # Configuration section of this instance
        confPath = cfgPath('/Systems/%s/%s/%s/%s' % (system, self.setup, instanceType, name))
        entry = instances[name]
        for option, default in (('PollingTime', HOUR), ('Port', None)):
          entry[option] = gConfig.getValue(os.path.join(confPath, option), default)
        entry["LogFileLocation"] = os.path.join(self.diracLocation, 'runit', system, name, 'log', 'current')
        entry["PID"] = info["PID"]
        entry['Module'] = info['Module']
        entry['RunitStatus'] = info['RunitStatus']
        entry['System'] = system

    return S_OK(instances)

  def on_terminate(self, agentName, process):
    """Log that the process of the given instance has terminated (used as termination callback)."""
    message = "%s's process with ID: %s has been terminated successfully" % (agentName, process.pid)
    self.log.info(message)

  def execute(self):
    """Run the checks for executors, agents and services, control components, check URLs, then notify.

    :returns: S_ERROR if any error is still recorded after the notification step, S_OK otherwise
    """
    for instanceType in ('executor', 'agent', 'service'):
      instances = getattr(self, instanceType + 's')
      # checkExecutor, checkAgent or checkService
      checker = getattr(self, 'check' + instanceType.capitalize())
      for name, options in instances.items():
        res = checker(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      # Failures carrying one of these texts are not reported
      ignored = ("Stopped does not exist", "Running does not exist")
      if not any(text in res['Message'] for text in ignored):
        self.logError("Failure to control components", res['Message'])

    if self.errors:
      self.logError('Something was wrong before, not checking URLs this time')
    else:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])

    self.sendNotification()

    if self.errors:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the time elapsed since the log file was last modified.

    :param str logFileLocation: path of the log file
    :returns: S_OK with a timedelta, S_ERROR if the file cannot be accessed
    """
    try:
      # Despite the method name, the modification time is what is read
      modified = datetime.fromtimestamp(os.path.getmtime(logFileLocation))
    except OSError as e:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e))

    return S_OK(datetime.now() - modified)

  def restartInstance(self, pid, instanceName, enabled):
    """Terminate the process and its children; the instance is then restarted automatically.

    :param pid: process ID of the instance
    :param str instanceName: name used in the logs and in the accounting
    :param bool enabled: restart flag for this kind of instance, combined with the global enable flag
    :returns: S_OK(NO_RESTART) when restarting is disabled, S_OK() after terminating, S_ERROR on psutil errors
    """
    if not self.enabled or not enabled:
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      mainProc = psutil.Process(int(pid))
      targets = mainProc.children(recursive=True) + [mainProc]

      for target in targets:
        target.terminate()

      # Wait up to 5 seconds for the processes to exit, then kill the survivors
      onExit = partial(self.on_terminate, instanceName)
      _gone, survivors = psutil.wait_procs(targets, timeout=5, callback=onExit)
      for survivor in survivors:
        self.log.info("Forcefully killing process %s" % survivor.pid)
        survivor.kill()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

    return S_OK()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond.

    :param str serviceName: name of the service
    :param dict options: information about the service, 'PID' is used for the restart
    :returns: S_OK, or the failed result of the restart attempt
    """
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if pingRes['OK']:
      # Only claim a healthy service when the ping really succeeded
      self.log.info("Service responded OK")
      return S_OK()

    self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
    res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
      self.log.info("Service %s has been successfully restarted" % serviceName)
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent.

    :param str agentName: name of the agent
    :param dict options: needs 'PollingTime', 'LogFileLocation' and 'PID'
    :returns: S_OK, or the failed result of the log file access or of the restart attempt
    """
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    # total_seconds() includes whole days; timedelta.seconds alone wraps around every 24 hours and
    # would make a log file older than a day look recent
    ageSeconds = res["Value"].total_seconds()
    ageMinutes = int(ageSeconds // MINUTES)
    self.log.info("Current log file for %s is %d minutes old" % (agentName, ageMinutes))

    # Tolerate at least two hours, or one hour more than the polling time
    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if ageSeconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = ageMinutes

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.

    :param str executor: name of the executor
    :param dict options: needs 'LogFileLocation' and 'PID'
    :returns: S_OK (with NO_RESTART when no checking jobs are found), or the failed result of a sub-step
    """
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    # total_seconds() includes whole days; timedelta.seconds alone wraps around every 24 hours and
    # would make a log file older than a day look recent
    ageSeconds = res["Value"].total_seconds()
    ageMinutes = int(ageSeconds // MINUTES)
    self.log.info("Current log file for %s is %d minutes old" % (executor, ageMinutes))

    if ageSeconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = ageMinutes

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['Value'] == NO_CHECKING_JOBS:
      # No checking jobs found for this executor: drop the accounting entry and do not restart
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    if res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Tell whether jobs in 'Checking' status have **executorName** as their MinorStatus.

    :returns: S_OK(CHECKING_JOBS), S_OK(NO_CHECKING_JOBS), or the failed result of the job query
    """
    # the query returns a list of job IDs
    resJobs = self.jobMonClient.getJobs({'Status': 'Checking', 'MinorStatus': executorName})
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs

    jobIDs = resJobs['Value']
    if not jobIDs:
      self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
      return S_OK(NO_CHECKING_JOBS)

    self.log.info("Found %d jobs in 'Checking' status for %s" % (len(jobIDs), executorName))
    return S_OK(CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Compare the current Run/Down state of the components with the state configured in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections, hand the mismatches to the ensure helpers and
    report instances that are known on only one side.

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault

    current, configured = resCurrent['Value'], resDefault['Value']

    # configured to run but currently down, and the other way around
    toStart = configured['Run'].intersection(current['Down'])
    toStop = configured['Down'].intersection(current['Run'])
    # present in only one of the two views
    unknown = configured['All'].symmetric_difference(current['All'])

    self._ensureComponentRunning(toStart)
    self._ensureComponentDown(toStop)

    for instance in unknown:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Get current status for components."""
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall
    currentStatus = {'Down': set(), 'Run': set(), 'All': set()}
    informationDict = resOverall['Value']
    for systemsDict in informationDict.values():
      for system, instancesDict in systemsDict.items():
        for instanceName, instanceInfoDict in instancesDict.items():
          identifier = '%s__%s' % (system, instanceName)
          runitStatus = instanceInfoDict.get('RunitStatus')
          if runitStatus in ('Run', 'Down'):
            currentStatus[runitStatus].add(identifier)

    currentStatus['All'] = currentStatus['Run'] | currentStatus['Down']
    return S_OK(currentStatus)

  def _getDefaultComponentStatus(self):
    """Read from the CS which components of this host should run and which should be stopped.

    The option names of /Registry/Hosts/<hostname>/Running and
    /Registry/Hosts/<hostname>/Stopped are used as component identifiers, where
    <hostname> is ``socket.gethostname()``.

    :returns: the failed result if either section cannot be read, S_ERROR if an
       identifier is listed in both sections, otherwise S_OK with a dict of sets
       under the keys 'Down', 'Run' and 'All'
    """
    hostSection = os.path.join('/Registry/Hosts/', socket.gethostname())
    # both sections are queried before either result is inspected
    runningRes = gConfig.getOptionsDict(os.path.join(hostSection, 'Running'))
    stoppedRes = gConfig.getOptionsDict(os.path.join(hostSection, 'Stopped'))
    for lookup in (runningRes, stoppedRes):
      if not lookup['OK']:
        return lookup

    toRun = set(runningRes['Value'].keys())
    toStop = set(stoppedRes['Value'].keys())

    # an identifier cannot be requested as running and stopped at the same time
    overlap = toRun.intersection(toStop)
    if overlap:
      self.logError("Overlap in configuration", str(overlap))
      return S_ERROR("Bad host configuration")

    return S_OK({'Down': toStop, 'Run': toRun, 'All': toRun | toStop})

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running."""
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running."""
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Check the CS URL entry of every service instance and commit changes if enabled.

    The configuration is refreshed from the master, the service instances are
    fetched again (stored in ``self.services``) and each one except the
    SystemAdministrator is passed to ``_checkServiceURL``. Pending CS
    modifications are committed only when ``self.commitURLs`` is set.

    :returns: S_ERROR if the instances cannot be obtained or the commit fails, S_OK otherwise
    """
    self.log.info("Checking URLs")
    # fetch the services again, they may have been started/stopped by the component control
    gConfig.forceRefresh(fromMaster=True)
    instances = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not instances["OK"]:
      return S_ERROR("Failure to get running services")

    self.services = instances["Value"]
    for serviceName, serviceOptions in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (serviceName, serviceOptions))
      # the SystemAdministrator does not have URLs, nothing to check
      if 'SystemAdministrator' not in serviceName:
        self._checkServiceURL(serviceName, serviceOptions)

    if not (self.csAPI.csModified and self.commitURLs):
      return S_OK()

    self.log.info("Commiting changes to the CS")
    commitRes = self.csAPI.commit()
    if commitRes['OK']:
      return S_OK()
    self.logError('Commit to CS failed', commitRes['Message'])
    return S_ERROR("Failed to commit to CS")

  def _checkServiceURL(self, serviceName, options):
    """Add or remove the URL of one service in the CS, depending on its runit status.

    :param str serviceName: name of the service instance
    :param dict options: must provide 'System', 'Module', 'RunitStatus' and the
       keys read by ``_getURL``

    A service with status 'Run' whose URL is missing gets it appended to
    /Systems/<System>/<setup>/URLs/<Module>; a service with status 'Down' whose
    URL is listed gets it removed. The change is staged with
    ``self.csAPI.modifyValue`` and noted in ``self.accounting``.
    """
    url = self._getURL(serviceName, options)
    system = options['System']
    module = options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))
    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)
    runitStatus = options['RunitStatus']
    # messages are phrased hypothetically when changes will not be committed
    prefix = '' if self.commitURLs else 'Would have '

    def _stage(template):
      """Log the change, record it in the accounting and stage the new URL list."""
      note = template % (prefix, url, system, module)
      self.log.info(note)
      self.accounting[serviceName + "/URL"]["Treatment"] = note
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      _stage("%sAdded URL %s to URLs for %s/%s")
    elif runitStatus == 'Down' and url in urls:
      urls.remove(url)
      _stage("%sRemoved URL %s from URLs for %s/%s")

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service."""
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
    def do_install(self, args):
        """
        Install various DIRAC components

        usage:

          install mysql
          install db <database>
          install service <system> <service> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install agent <system> <agent> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install executor <system> <executor> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
    """
        # The docstring above is printed verbatim as the usage text, so the
        # implementation notes are kept in comments.
        #
        # args: the text typed after "install"; it is split on whitespace.
        # Nothing is returned: progress goes to stdout, failures to
        # self.__errMsg, and every failure aborts the command.
        argss = args.split()
        if not argss:
            print(self.do_install.__doc__)
            return

        option = argss[0]
        del argss[0]
        if option == "mysql":
            print("Installing MySQL database, this can take a while ...")
            client = SystemAdministratorClient(self.host, self.port)
            if InstallTools.mysqlPassword == 'LocalConfig':
                InstallTools.mysqlPassword = ''
            InstallTools.getMySQLPasswords()
            result = client.installMySQL(InstallTools.mysqlRootPwd,
                                         InstallTools.mysqlPassword)
            if not result['OK']:
                self.__errMsg(result['Message'])
            else:
                print("MySQL: %s" % (result['Value'],))
                print("You might need to restart SystemAdministrator service to take new settings into account")
        elif option == "db":
            if not argss:
                print(self.do_install.__doc__)
                return
            database = argss[0]
            client = SystemAdministratorClient(self.host, self.port)

            result = client.getAvailableDatabases()
            if not result['OK']:
                self.__errMsg("Can not get database list: %s" %
                              result['Message'])
                return
            if database not in result['Value']:
                self.__errMsg("Unknown database %s: " % database)
                return
            system = result['Value'][database]['System']
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if not setup:
                self.__errMsg("Unknown current setup")
                return
            instance = gConfig.getValue(
                '/DIRAC/Setups/%s/%s' % (setup, system), '')
            if not instance:
                self.__errMsg("No instance defined for system %s" % system)
                self.__errMsg(
                    "\tAdd new instance with 'add instance %s <instance_name>'"
                    % system)
                return

            if not InstallTools.mysqlPassword:
                InstallTools.mysqlPassword = '******'
            InstallTools.getMySQLPasswords()
            result = client.installDatabase(database,
                                            InstallTools.mysqlRootPwd)
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            extension, system = result['Value']
            # result = client.addDatabaseOptionsToCS( system, database )
            InstallTools.mysqlHost = self.host
            result = client.getInfo()
            if not result['OK']:
                self.__errMsg(result['Message'])
                # without the host information there is no 'Value' to read below
                return
            hostSetup = result['Value']['Setup']
            result = InstallTools.addDatabaseOptionsToCS(
                gConfig, system, database, hostSetup)
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            print("Database %s from %s/%s installed successfully" % (
                database, extension, system))
        elif option in ["service", "agent", "executor"]:
            if len(argss) < 2:
                print(self.do_install.__doc__)
                return

            system = argss[0]
            del argss[0]
            component = argss[0]
            del argss[0]

            # Remaining tokens: "-m <ModuleName>" and any number of
            # "-p <Option>=<Value>"; unrecognised tokens are ignored.
            specialOptions = {}
            module = ''
            for index, token in enumerate(argss):
                if token not in ("-m", "-p"):
                    continue
                if index + 1 >= len(argss):
                    self.__errMsg("Missing value after %s" % token)
                    return
                value = argss[index + 1]
                if token == "-m":
                    specialOptions['Module'] = value
                    module = value
                else:
                    if '=' not in value:
                        self.__errMsg(
                            "Expected <Option>=<Value> after -p, got '%s'" % value)
                        return
                    # split once only, so the value itself may contain '='
                    opt, optValue = value.split('=', 1)
                    specialOptions[opt] = optValue
            if module == component:
                module = ''

            client = SystemAdministratorClient(self.host, self.port)
            # First need to update the CS
            # result = client.addDefaultOptionsToCS( option, system, component )
            InstallTools.host = self.host
            result = client.getInfo()
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            hostSetup = result['Value']['Setup']

            # Install Module section if not yet there
            if module:
                result = InstallTools.addDefaultOptionsToCS(
                    gConfig, option, system, module, getCSExtensions(),
                    hostSetup)
                # stop here on failure instead of silently overwriting the result
                if not result['OK']:
                    self.__errMsg(result['Message'])
                    return
                # Add component section with specific parameters only
                result = InstallTools.addDefaultOptionsToCS(
                    gConfig,
                    option,
                    system,
                    component,
                    getCSExtensions(),
                    hostSetup,
                    specialOptions,
                    addDefaultOptions=False)
            else:
                # Install component section
                result = InstallTools.addDefaultOptionsToCS(
                    gConfig, option, system, component, getCSExtensions(),
                    hostSetup, specialOptions)

            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            # Then we can install and start the component
            result = client.setupComponent(option, system, component, module)
            if not result['OK']:
                self.__errMsg(result['Message'])
                return
            compType = result['Value']['ComponentType']
            runit = result['Value']['RunitStatus']
            print("%s %s_%s is installed, runit status: %s" % (
                compType, system, component, runit))
        else:
            print("Unknown option: %s" % option)
示例#50
0
  def __componentAction( self , action = None ):

    """
    Perform an action on the components named in the request parameters.

    Every request parameter other than "action" is expected to be named
    "<component> @ <host>" and to carry the system as its value; parameters
    that do not have this shape are ignored.

    :param str action: one of "restart", "start", "stop", "uninstall"
    :return: a dict { "success" : "false" , "error" : <message> } when the
             action is missing or unknown, or when no component was selected.
             Otherwise the outcome is stored in self.actionSuccess (component
             names) and self.actionFailed ("<host>: <message>" strings) and
             the code visible here returns nothing.
    """

    DN = getUserDN()
    group = getSelectedGroup()

    if not action:
      error = "Action is not defined or has zero length"
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }

    # name of the client call that implements each supported action
    rpcNames = { "restart" : "restartComponent" ,
                 "start" : "startComponent" ,
                 "stop" : "stopComponent" ,
                 "uninstall" : "uninstallComponent" }

    if action not in rpcNames:
      error = "The request parameters action '%s' is unknown" % action
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }
    self.action = action

    # host -> list of [ system , component ] pairs
    targets = dict()
    for param in request.params:
      if param == "action":
        continue

      target = param.split( " @ " , 1 )
      if len( target ) != 2:
        continue

      system = request.params[ param ]
      gLogger.always( "System: %s" % system )
      host = target[ 1 ]
      gLogger.always( "Host: %s" % host )
      component = target[ 0 ]
      gLogger.always( "Component: %s" % component )
      targets.setdefault( host , list() ).append( [ system , component ] )

    if not targets:
      error = "Failed to get component(s) for %s" % action
      gLogger.debug( error )
      return { "success" : "false" , "error" : error }

    gLogger.always( targets )
    self.actionSuccess = list()
    self.actionFailed = list()

    for hostname , components in targets.items():

      if not components:
        continue

      client = SystemAdministratorClient( hostname , None , delegatedDN=DN ,
                                          delegatedGroup=group )

      for system , component in components:

        # The call result gets its own name: reusing the name of the host
        # mapping would make the lookup for the next host fail.
        try:
          outcome = getattr( client , rpcNames[ action ] )( system , component )
        except Exception as x:
          # Report the failure in the same shape as a regular failed call
          outcome = { "OK" : False , "Message" : "Exception: %s" % str( x ) }
        gLogger.debug( "Result: %s" % outcome )

        if not outcome[ "OK" ]:
          error = hostname + ": " + outcome[ "Message" ]
          self.actionFailed.append( error )
          gLogger.error( "Failure during component %s: %s" % ( action , error ) )
        else:
          gLogger.always( "Successfully %s component %s" % ( action , component ) )
          self.actionSuccess.append( component )
  def do_install( self, args ):
    """
        Install various DIRAC components

        usage:

          install mysql
          install db <database>
          install service <system> <service> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install agent <system> <agent> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install executor <system> <executor> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
    """
    # The docstring above is shown verbatim as the usage text, so the
    # implementation notes are kept in comments.
    #
    # args: the text typed after "install"; it is split on whitespace.
    # Nothing is returned: progress goes to gLogger.notice, failures to
    # self._errMsg, and every failure aborts the command.
    argss = args.split()
    if not argss:
      gLogger.notice( self.do_install.__doc__ )
      return

    option = argss[0]
    del argss[0]
    if option == "mysql":
      gLogger.notice( "Installing MySQL database, this can take a while ..." )
      client = SystemAdministratorClient( self.host, self.port )
      if gComponentInstaller.mysqlPassword == 'LocalConfig':
        gComponentInstaller.mysqlPassword = ''
      gComponentInstaller.getMySQLPasswords()
      result = client.installMySQL( gComponentInstaller.mysqlRootPwd, gComponentInstaller.mysqlPassword )
      if not result['OK']:
        self._errMsg( result['Message'] )
      else:
        gLogger.notice( "MySQL:", result['Value'] )
        gLogger.notice( "You might need to restart SystemAdministrator service to take new settings into account" )
    elif option == "db":
      if not argss:
        gLogger.notice( self.do_install.__doc__ )
        return
      database = argss[0]
      client = SystemAdministratorClient( self.host, self.port )

      result = client.getAvailableDatabases()
      if not result['OK']:
        self._errMsg( "Can not get database list: %s" % result['Message'] )
        return
      if database not in result['Value']:
        self._errMsg( "Unknown database %s: " % database )
        return
      system = result['Value'][database]['System']
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if not setup:
        self._errMsg( "Unknown current setup" )
        return
      instance = gConfig.getValue( '/DIRAC/Setups/%s/%s' % ( setup, system ), '' )
      if not instance:
        self._errMsg( "No instance defined for system %s" % system )
        self._errMsg( "\tAdd new instance with 'add instance %s <instance_name>'" % system )
        return

      if not gComponentInstaller.mysqlPassword:
        gComponentInstaller.mysqlPassword = '******'
      gComponentInstaller.getMySQLPasswords()
      result = client.installDatabase( database, gComponentInstaller.mysqlRootPwd )
      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      extension, system = result['Value']

      # The CPU model and host name are passed on to the installation monitoring
      result = client.getHostInfo()
      if not result[ 'OK' ]:
        self._errMsg( result[ 'Message' ] )
        return
      cpu = result[ 'Value' ][ 'CPUModel' ]
      hostname = self.host

      if database != 'InstalledComponentsDB':
        result = MonitoringUtilities.monitorInstallation( 'DB', system.replace( 'System', '' ), database, cpu = cpu, hostname = hostname )
        if not result['OK']:
          self._errMsg( result['Message'] )
          return
      # result = client.addDatabaseOptionsToCS( system, database )
      gComponentInstaller.mysqlHost = self.host
      result = client.getInfo()
      if not result['OK']:
        self._errMsg( result['Message'] )
        # without the host information there is no 'Value' to read below
        return
      hostSetup = result['Value']['Setup']
      result = gComponentInstaller.addDatabaseOptionsToCS( gConfig, system, database, hostSetup, overwrite = True )
      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      gLogger.notice( "Database %s from %s/%s installed successfully" % ( database, extension, system ) )
    elif option in self.runitComponents:
      if len( argss ) < 2:
        gLogger.notice( self.do_install.__doc__ )
        return

      system = argss[0]
      del argss[0]
      component = argss[0]
      del argss[0]

      # Remaining tokens: "-m <ModuleName>" and any number of
      # "-p <Option>=<Value>"; unrecognised tokens are ignored.
      specialOptions = {}
      module = ''
      for index, token in enumerate( argss ):
        if token not in ( "-m", "-p" ):
          continue
        if index + 1 >= len( argss ):
          self._errMsg( "Missing value after %s" % token )
          return
        value = argss[index + 1]
        if token == "-m":
          specialOptions['Module'] = value
          module = value
        else:
          if '=' not in value:
            self._errMsg( "Expected <Option>=<Value> after -p, got '%s'" % value )
            return
          # split once only, so the value itself may contain '='
          opt, optValue = value.split( '=', 1 )
          specialOptions[opt] = optValue
      if module == component:
        module = ''

      client = SystemAdministratorClient( self.host, self.port )
      # First need to update the CS
      # result = client.addDefaultOptionsToCS( option, system, component )
      gComponentInstaller.host = self.host
      result = client.getInfo()
      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      hostSetup = result['Value']['Setup']

      # Install Module section if not yet there
      if module:
        result = gComponentInstaller.addDefaultOptionsToCS( gConfig, option, system, module,
                                                            getCSExtensions(), hostSetup )
        # in case of Error we must stop, this can happen when the module name is wrong...
        if not result['OK']:
          self._errMsg( result['Message'] )
          return
        # Add component section with specific parameters only
        result = gComponentInstaller.addDefaultOptionsToCS( gConfig, option, system, component,
                                                            getCSExtensions(), hostSetup, specialOptions,
                                                            addDefaultOptions = True )
      else:
        # Install component section
        result = gComponentInstaller.addDefaultOptionsToCS( gConfig, option, system, component,
                                                            getCSExtensions(), hostSetup, specialOptions )

      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      # Then we can install and start the component
      result = client.setupComponent( option, system, component, module )
      if not result['OK']:
        self._errMsg( result['Message'] )
        return
      compType = result['Value']['ComponentType']
      runit = result['Value']['RunitStatus']
      gLogger.notice( "%s %s_%s is installed, runit status: %s" % ( compType, system, component, runit ) )

      # And register it in the database
      result = client.getHostInfo()
      if not result[ 'OK' ]:
        self._errMsg( result[ 'Message' ] )
        return
      cpu = result[ 'Value' ][ 'CPUModel' ]
      hostname = self.host
      if component == 'ComponentMonitoring':
        # Make sure that the service is running before trying to use it:
        # ping, then retry up to maxTries times with a 3 second pause
        nTries = 0
        maxTries = 5
        mClient = ComponentMonitoringClient()
        result = mClient.ping()
        while not result[ 'OK' ] and nTries < maxTries:
          time.sleep( 3 )
          result = mClient.ping()
          nTries = nTries + 1

        if not result[ 'OK' ]:
          self._errMsg( 'ComponentMonitoring service taking too long to start. Installation will not be logged into the database' )
          return

        result = MonitoringUtilities.monitorInstallation( 'DB', system, 'InstalledComponentsDB', cpu = cpu, hostname = hostname )
        if not result['OK']:
          self._errMsg( 'Error registering installation into database: %s' % result[ 'Message' ] )
          return

      result = MonitoringUtilities.monitorInstallation( option, system, component, module, cpu = cpu, hostname = hostname )
      if not result['OK']:
        self._errMsg( 'Error registering installation into database: %s' % result[ 'Message' ] )
        return
    else:
      gLogger.notice( "Unknown option:", option )
示例#52
0
class LemonAgent( AgentModule ):
  """ Agent that writes one "LEMON ..." log line per set-up service and agent
      reported by the SystemAdministrator of the local host, stating whether
      the component is running.
  """

  def initialize( self ):
    """ Define the criticality/status labels, read the current setup and
        create the client for the local SystemAdministrator.

        :return: S_OK()
    """
    self.NON_CRITICAL = "NonCritical"
    self.CRITICAL = "Critical"
    self.FAILURE = "FAILURE"
    self.OK = "OK"

    self.setup = gConfig.getValue( '/DIRAC/Setup', 'LHCb-Development' )
    # when False, components with NonCritical criticality are not reported
    self.outputNonCritical = True

    self.admClient = SystemAdministratorClient( 'localhost' )

    return S_OK()

  def execute( self ):
    """ Main execution method

        Nothing is checked when the current setup is not listed in
        /Operations/lhcb/Lemon/MonitoredSetups or when this host is listed in
        /Operations/lhcb/Lemon/HostsInMaintenance. Otherwise the overall
        status is fetched and the services and agents are reported.

        :return: S_OK() in every case; problems are reported as log lines only
    """

    monitoredSetups = gConfig.getValue( '/Operations/lhcb/Lemon/MonitoredSetups', ['LHCb-Production'] )
    self.monitoringEnabled = self.setup in monitoredSetups

    if not self.monitoringEnabled:
      self._log( "Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                 "Monitoring not enabled for this setup: " + self.setup + ". Exiting." )
      return S_OK()

    hostsInMaintenance = gConfig.getValue( '/Operations/lhcb/Lemon/HostsInMaintenance', [] )
    if gethostname() in hostsInMaintenance:
      self._log( "Framework/LemonAgent", self.NON_CRITICAL, self.OK, "I am in maintenance mode, exiting." )
      return S_OK()

    result = self.admClient.getOverallStatus()

    if not result or not result['OK']:
      self._log( "Framework/LemonAgent", self.CRITICAL, self.FAILURE, "Can not obtain result!!" )
      return S_OK()

    services = result[ 'Value' ][ 'Services' ]
    agents = result[ 'Value' ][ 'Agents' ]
    self._processResults( services )
    self._processResults( agents )

    return S_OK()

  def _processResults( self, results ):
    """ Write one log line per set-up component.

        :param dict results: system name -> component name -> component info;
                             the info provides 'Setup' and 'RunitStatus'
    """
    for system in results:
      for part in results[system]:
        component = results[system][part]
        componentName = system + "/" + part
        # we want to monitor only set up services and agents
        if not component['Setup']:
          continue
        # looked up once, so the filter and the log line always agree
        critLevel = self._getCriticality( componentName )
        if critLevel == self.NON_CRITICAL and not self.outputNonCritical:
          continue
        if component['RunitStatus'] == 'Run':
          self._log( componentName, critLevel, self.OK, "Service/Agent running fine" )
        else:
          self._log( componentName, critLevel, self.FAILURE, "Service/Agent failure!" )

  def _getCriticality( self, component ):
    """ Return the configured criticality of a component.

        :param str component: "<system>/<name>"
        :return: the value from the setup specific section if present, else
                 the common value, else self.NON_CRITICAL
    """
    # lets try to retrieve common criticality first
    criticality = gConfig.getValue( '/Operations/lhcb/Lemon/Criticalities/' + component, self.NON_CRITICAL )
    # maybe it got redefined in <setup> subtree:
    criticality = gConfig.getValue( '/Operations/lhcb/' + self.setup + '/Lemon/Criticalities/' + component, criticality )
    return criticality

  def _log( self, component, criticality, status, string ):
    """ Write "LEMON <criticality> <status> <component>: <string>" at info level. """
    gLogger.info( "LEMON " + criticality + " " + status + " " + component + ": " + string + "\n" )
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring above is printed verbatim as the usage text, so the
    # implementation notes are kept in comments.
    #
    # args: the text typed after "show"; it is split on whitespace, the first
    # word selects the report and the rest is passed on where needed.
    # Nothing is returned: reports go to stdout, failures to self.__errMsg.

    argss = args.split()
    if not argss:
      print(self.do_show.__doc__)
      return

    option = argss[0]
    del argss[0]

    # reports that are a single client call whose value is pretty-printed
    listingCalls = { 'software': 'getSoftwareComponents',
                     'installed': 'getInstalledComponents',
                     'setup': 'getSetupComponents' }

    if option in listingCalls:
      client = SystemAdministratorClient( self.host, self.port )
      result = getattr( client, listingCalls[option] )()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print("")
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print("Current project is %s" % result[ 'Value' ])
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        fields = ["System",'Name','Module','Type','Setup','Installed','Runit','Uptime','PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            for component in sorted( rDict[compType][system] ):
              info = rDict[compType][system][component]
              # only installed components are listed, so the 'Installed'
              # column is constant
              if not info['Installed']:
                continue
              # the type column is the component type lowercased, minus its last character
              record = [ system, component, str( info['Module'] ), compType.lower()[:-1] ]
              record.append( 'Setup' if info['Setup'] else 'NotSetup' )
              record.append( 'Installed' )
              record.append( str( info['RunitStatus'] ) )
              record.append( str( info['Timeup'] ) )
              record.append( str( info['PID'] ) )
              records.append( record )
        printTable( fields, records )
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self.__errMsg( resultSW['Message'] )
        return

      sw = resultSW['Value']
      installed = result['Value']
      print("")
      for db in sw:
        if db in installed:
          print("%s : Installed" % db.rjust( 25 ))
        else:
          print("%s : Not installed" % db.rjust( 25 ))
      if not sw:
        print("No database found")
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      elif result['Value']:
        print("")
        for par, value in result['Value'].items():
          print("%s : %s" % ( par.rjust( 28 ), value ))
      else:
        print("No MySQL database found")
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print("")
        print("Setup: %s" % ( result['Value']['Setup'], ))
        print("DIRAC version: %s" % ( result['Value']['DIRAC'], ))
        if result['Value']['Extensions']:
          for e, v in result['Value']['Extensions'].items():
            print("%s version %s" % ( e, v ))
        print("")
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print("")
        print("Host info:")
        print("")

        fields = ['Parameter','Value']
        records = [ [ key, str( value ) ] for key, value in result['Value'].items() ]

        printTable( fields, records )

    elif option == "errors":
      self.getErrors( argss )
    else:
      print("Unknown option: %s" % option)
  def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show doc <type> <system> <name>
                             - show documentation for a given service or agent
          show host          - show host related parameters
          show hosts         - show all available hosts
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring above is echoed verbatim as the help text (see the empty
    # argument and "doc" branches), so implementation notes live in comments.
    #
    # args is the raw text typed after "show". Its first word selects the
    # report; the remaining words are handed over to the log / errors /
    # installations / doc handlers. Failed remote calls are reported through
    # self.__errMsg. The method never returns a value.

    argss = args.split()
    if not argss:
      gLogger.notice( self.do_show.__doc__ )
      return

    option = argss[0]
    del argss[0]

    if option in ( 'software', 'installed', 'setup' ):
      # These three reports differ only in the remote call; the returned
      # structure is pretty-printed as is
      client = SystemAdministratorClient( self.host, self.port )
      if option == 'software':
        result = client.getSoftwareComponents()
      elif option == 'installed':
        result = client.getInstalledComponents()
      else:
        result = client.getSetupComponents()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        pprint.pprint( result['Value'] )
    elif option == 'project':
      result = SystemAdministratorClient( self.host, self.port ).getProject()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( "Current project is %s" % result[ 'Value' ] )
    elif option == 'status':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getOverallStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        fields = ["System", 'Name', 'Module', 'Type', 'Setup', 'Installed', 'Runit', 'Uptime', 'PID']
        records = []
        rDict = result['Value']
        for compType in rDict:
          for system in rDict[compType]:
            # sorted() gives the same ordering as keys() + sort() without
            # relying on keys() returning a list
            for component in sorted( rDict[compType][system] ):
              compInfo = rDict[compType][system][component]
              # Only installed components are listed, hence the constant
              # 'Installed' column value below
              if not compInfo['Installed']:
                continue
              if compInfo['Setup']:
                setupStatus = 'Setup'
              else:
                setupStatus = 'NotSetup'
              records.append( [ system,
                                component,
                                str( compInfo['Module'] ),
                                compType.lower()[:-1],  # drop the last character of the type key
                                setupStatus,
                                'Installed',
                                str( compInfo['RunitStatus'] ),
                                str( compInfo['Timeup'] ),
                                str( compInfo['PID'] ) ] )
        printTable( fields, records )
    elif option == 'database' or option == 'databases':
      client = SystemAdministratorClient( self.host, self.port )
      # NOTE(review): this literal looks like a masked placeholder - confirm
      # the value that InstallTools.getMySQLPasswords() expects here
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = "******"
      InstallTools.getMySQLPasswords()
      result = client.getDatabases( InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      resultSW = client.getAvailableDatabases()
      if not resultSW['OK']:
        self.__errMsg( resultSW['Message'] )
        return

      sw = resultSW['Value']
      installed = result['Value']
      gLogger.notice( '' )
      for db in sw:
        if db in installed:
          gLogger.notice( db.rjust( 25 ), ': Installed' )
        else:
          gLogger.notice( db.rjust( 25 ), ': Not installed' )
      if not sw:
        gLogger.notice( "No database found" )
    elif option == 'mysql':
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getMySQLStatus()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      elif result['Value']:
        gLogger.notice( '' )
        for par, value in result['Value'].items():
          # Build one formatted line; handing a tuple to the logger would
          # print the tuple representation instead of "name : value"
          gLogger.notice( "%s : %s" % ( par.rjust( 28 ), value ) )
      else:
        gLogger.notice( "No MySQL database found" )
    elif option == "log":
      self.getLog( argss )
    elif option == "info":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        info = result['Value']
        gLogger.notice( '' )
        gLogger.notice( "Setup:", info['Setup'] )
        gLogger.notice( "DIRAC version:", info['DIRAC'] )
        if info['Extensions']:
          for e, v in info['Extensions'].items():
            gLogger.notice( "%s version" % e, v )
        gLogger.notice( '' )
    elif option == "host":
      client = SystemAdministratorClient( self.host, self.port )
      result = client.getHostInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        gLogger.notice( '' )
        gLogger.notice( "Host info:" )
        gLogger.notice( '' )

        fields = ['Parameter', 'Value']
        records = [ [ key, str( value ) ] for key, value in result['Value'].items() ]
        printTable( fields, records )
    elif option == "hosts":
      client = ComponentMonitoringClient()
      result = client.getHosts( {}, False, False )
      if not result[ 'OK' ]:
        self.__errMsg( 'Error retrieving the list of hosts: %s' % ( result[ 'Message' ] ) )
      else:
        hostList = result[ 'Value' ]
        separator = '-' * 69
        gLogger.notice( '' )
        gLogger.notice( ' ' + 'Host'.center( 32 ) + ' ' + 'CPU'.center( 34 ) + ' ' )
        gLogger.notice( separator )
        for element in hostList:
          gLogger.notice( '|' + element[ 'HostName' ].center( 32 ) + '|' + element[ 'CPU' ].center( 34 ) + '|' )
        gLogger.notice( separator )
        gLogger.notice( '' )
    elif option == "errors":
      self.getErrors( argss )
    elif option == "installations":
      self.getInstallations( argss )
    elif option == "doc":
      # Expect <type> <system> <name>; anything else falls back to the help text
      if len( argss ) > 2 and argss[0] in [ 'service', 'agent' ]:
        compType, compSystem, compModule = argss[0], argss[1], argss[2]
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getComponentDocumentation( compType, compSystem, compModule )
        if result[ 'OK' ]:
          gLogger.notice( result[ 'Value' ] )
        else:
          self.__errMsg( result[ 'Message' ] )
      else:
        gLogger.notice( self.do_show.__doc__ )
    else:
      gLogger.notice( "Unknown option:", option )
  def do_install( self, args ):
    """
        Install various DIRAC components

        usage:

          install mysql
          install db <database>
          install service <system> <service> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install agent <system> <agent> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install executor <system> <executor> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
    """
    # The docstring above is printed verbatim as the usage text, so
    # implementation notes are kept in comments.
    #
    # args is the raw text typed after "install". Its first word selects what
    # to install, the rest are the arguments of that sub-command. Failures are
    # reported through self.__errMsg and stop the command; nothing is returned.
    #
    # print is always called with one parenthesised argument: that form
    # produces the same output as the print statement.
    argss = args.split()
    if not argss:
      print( self.do_install.__doc__ )
      return

    option = argss[0]
    del argss[0]
    if option == "mysql":
      print( "Installing MySQL database, this can take a while ..." )
      client = SystemAdministratorClient( self.host, self.port )
      if InstallTools.mysqlPassword == 'LocalConfig':
        InstallTools.mysqlPassword = ''
      InstallTools.getMySQLPasswords()
      result = client.installMySQL( InstallTools.mysqlRootPwd, InstallTools.mysqlPassword )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print( "MySQL: %s" % ( result['Value'], ) )
        print( "You might need to restart SystemAdministrator service to take new settings into account" )
    elif option == "db":
      if not argss:
        print( self.do_install.__doc__ )
        return
      database = argss[0]
      client = SystemAdministratorClient( self.host, self.port )

      result = client.getAvailableDatabases()
      if not result['OK']:
        self.__errMsg( "Can not get database list: %s" % result['Message'] )
        return
      if database not in result['Value']:
        self.__errMsg( "Unknown database %s: " % database )
        return
      system = result['Value'][database]['System']
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if not setup:
        self.__errMsg( "Unknown current setup" )
        return
      instance = gConfig.getValue( '/DIRAC/Setups/%s/%s' % ( setup, system ), '' )
      if not instance:
        self.__errMsg( "No instance defined for system %s" % system )
        self.__errMsg( "\tAdd new instance with 'add instance %s <instance_name>'" % system )
        return

      # NOTE(review): this literal looks like a masked placeholder - confirm
      # the value that InstallTools.getMySQLPasswords() expects here
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = '******'
      InstallTools.getMySQLPasswords()
      result = client.installDatabase( database, InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      extension, system = result['Value']
      # result = client.addDatabaseOptionsToCS( system, database )
      InstallTools.mysqlHost = self.host
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
        # Without the host setup the CS can not be updated; stop here rather
        # than failing on the missing 'Value' key
        return
      hostSetup = result['Value']['Setup']
      result = InstallTools.addDatabaseOptionsToCS( gConfig, system, database, hostSetup )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      print( "Database %s from %s/%s installed successfully" % ( database, extension, system ) )
    elif option in ["service", "agent", "executor"]:
      if len( argss ) < 2:
        print( self.do_install.__doc__ )
        return

      system = argss[0]
      component = argss[1]
      extraArgs = argss[2:]

      # Collect "-m <ModuleName>" and "-p <Option>=<Value>" settings
      specialOptions = {}
      module = ''
      if extraArgs and extraArgs[-1] in ( "-m", "-p" ):
        self.__errMsg( "Missing value after %s" % extraArgs[-1] )
        return
      # Walk over ( flag, following word ) pairs
      for flag, flagValue in zip( extraArgs, extraArgs[1:] ):
        if flag == "-m":
          specialOptions['Module'] = flagValue
          module = flagValue
        elif flag == "-p":
          if '=' not in flagValue:
            self.__errMsg( "Invalid option '%s', expected <Option>=<Value>" % flagValue )
            return
          # Split on the first '=' only so that values may contain '='
          opt, value = flagValue.split( '=', 1 )
          specialOptions[opt] = value
      # A module named like the component is treated as no module at all
      if module == component:
        module = ''

      client = SystemAdministratorClient( self.host, self.port )
      # First need to update the CS
      # result = client.addDefaultOptionsToCS( option, system, component )
      InstallTools.host = self.host
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      hostSetup = result['Value']['Setup']

      if module:
        # Install Module section if not yet there
        result = InstallTools.addDefaultOptionsToCS( gConfig, option, system, module,
                                                     getCSExtensions(), hostSetup )
        # Do not let the next call hide a failure of this one
        if not result['OK']:
          self.__errMsg( result['Message'] )
          return
        # Add component section with specific parameters only
        result = InstallTools.addDefaultOptionsToCS( gConfig, option, system, component,
                                                     getCSExtensions(), hostSetup, specialOptions,
                                                     addDefaultOptions = False )
      else:
        # Install component section
        result = InstallTools.addDefaultOptionsToCS( gConfig, option, system, component,
                                                     getCSExtensions(), hostSetup, specialOptions )

      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      # Then we can install and start the component
      result = client.setupComponent( option, system, component, module )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      compType = result['Value']['ComponentType']
      runit = result['Value']['RunitStatus']
      print( "%s %s_%s is installed, runit status: %s" % ( compType, system, component, runit ) )
    else:
      print( "Unknown option: %s" % option )
  def do_install( self, args ):
    """ 
        Install various DIRAC components 
    
        usage:
        
          install mysql
          install db <database>
          install service <system> <service>
          install agent <system> <agent>
    """
    # The docstring above is printed verbatim as the usage text, so
    # implementation notes are kept in comments.
    #
    # args is the raw text typed after "install". Its first word selects what
    # to install, the rest are the arguments of that sub-command. Failures are
    # reported through self.__errMsg and stop the command; nothing is returned.
    #
    # print is always called with one parenthesised argument: that form
    # produces the same output as the print statement.
    argss = args.split()
    if not argss:
      print( self.do_install.__doc__ )
      return

    option = argss[0]
    del argss[0]
    if option == "mysql":
      print( "Installing MySQL database, this can take a while ..." )
      client = SystemAdministratorClient( self.host, self.port )
      if InstallTools.mysqlPassword == 'LocalConfig':
        InstallTools.mysqlPassword = ''
      InstallTools.getMySQLPasswords()
      result = client.installMySQL( InstallTools.mysqlRootPwd, InstallTools.mysqlPassword )
      if not result['OK']:
        self.__errMsg( result['Message'] )
      else:
        print( "MySQL: %s" % ( result['Value'], ) )
        print( "You might need to restart SystemAdministrator service to take new settings into account" )
    elif option == "db":
      if not argss:
        print( self.do_install.__doc__ )
        return
      database = argss[0]
      client = SystemAdministratorClient( self.host, self.port )

      result = client.getAvailableDatabases()
      if not result['OK']:
        self.__errMsg( "Can not get database list: %s" % result['Message'] )
        return
      if database not in result['Value']:
        self.__errMsg( "Unknown database %s: " % database )
        return
      system = result['Value'][database]['System']
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if not setup:
        self.__errMsg( "Unknown current setup" )
        return
      instance = gConfig.getValue( '/DIRAC/Setups/%s/%s' % ( setup, system ), '' )
      if not instance:
        self.__errMsg( "No instance defined for system %s" % system )
        self.__errMsg( "\tAdd new instance with 'add instance %s <instance_name>'" % system )
        return

      # NOTE(review): this literal looks like a masked placeholder - confirm
      # the value that InstallTools.getMySQLPasswords() expects here
      if not InstallTools.mysqlPassword:
        InstallTools.mysqlPassword = '******'
      InstallTools.getMySQLPasswords()
      result = client.installDatabase( database, InstallTools.mysqlRootPwd )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      extension, system = result['Value']
      # result = client.addDatabaseOptionsToCS( system, database )
      InstallTools.mysqlHost = self.host
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
        # Without the host setup the CS can not be updated; stop here rather
        # than failing on the missing 'Value' key
        return
      hostSetup = result['Value']['Setup']
      result = InstallTools.addDatabaseOptionsToCS( gConfig, system, database, hostSetup )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      print( "Database %s from %s/%s installed successfully" % ( database, extension, system ) )
    elif option in ( "service", "agent" ):
      if len( argss ) < 2:
        print( self.do_install.__doc__ )
        return

      system, component = argss[0], argss[1]
      client = SystemAdministratorClient( self.host, self.port )
      # First need to update the CS
      # result = client.addDefaultOptionsToCS( option, system, component )
      InstallTools.host = self.host
      result = client.getInfo()
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      hostSetup = result['Value']['Setup']
      result = InstallTools.addDefaultOptionsToCS( gConfig, option, system, component, getCSExtensions(), hostSetup )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      # Then we can install and start the component
      result = client.setupComponent( option, system, component )
      if not result['OK']:
        self.__errMsg( result['Message'] )
        return
      compType = result['Value']['ComponentType']
      runit = result['Value']['RunitStatus']
      print( "%s %s_%s is installed, runit status: %s" % ( compType, system, component, runit ) )
    else:
      print( "Unknown option: %s" % option )