Example #1
def SetInitState(cfg, state):
  """Sets system's initialization state. For oneway, it stores it in
  C.ENT_SYSTEM_INIT_STATE. For Clusters, it stores it in chubby file
  /ls/ent<version>/ENT_SYSTEM_INIT_STATE.

  @param cfg - of type configurator.
  @param state - string
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
    return

  tmpfile = E.mktemp('/export/hda3/tmp')
  try:
    f = open(tmpfile, 'w')
    f.write(state)
    f.close()
  except IOError:
    logging.fatal('Cannot write to temp file %s' % tmpfile)
    return
  version = cfg.getGlobalParam('VERSION')
  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
  chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
  write_cmd = '%s cp %s %s/%s' % (lockserv_cmd_prefix,
      tmpfile, chubby_root_dir, 'ENT_SYSTEM_INIT_STATE')
  logging.info('setting system init state to: %s', state)
  E.exe_or_fail(write_cmd)
  E.exe('rm -rf %s' % tmpfile)
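# Illustrative call of SetInitState above (a sketch, not from the original
# module; `cfg` and the state string are assumptions about the surrounding
# codebase):
#
#   SetInitState(cfg, 'INITIALIZED')
#   # oneway:  stored as C.ENT_SYSTEM_INIT_STATE in the configurator
#   # cluster: copied into /ls/<cell>/ENT_SYSTEM_INIT_STATE via `lockserv cp`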
Example #2
def AvoidGFSMasterOnNode(ver, testver, node):
    """ best efforts to make a node a none-primary master

  Arguments:
    ver:     '4.6.5'
    testver: 0 - not a test version. 1 - test version.
    node:    'ent4'

  Returns:
    'ent1' if ent1 is the primary gfs_master
    None if could not find out.
  """

    nodes = core_utils.GetNodes()
    if len(nodes) == 1:
        return

    (all_gfs_masters, shadow_gfs_masters) = core_utils.GFSMasterNodes()
    # assuming we have at least 2 nodes running gfs master
    if all_gfs_masters[0] == node:
        desired_node = all_gfs_masters[1]
    else:
        desired_node = all_gfs_masters[0]
    if desired_node != GFSMasterLockHolder(ver, testver):
        ForceGFSPrimaryMaster(testver, desired_node)
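# A standalone sketch of the node-selection rule used above (illustrative
# only; the helper name is not part of the original module). It assumes at
# least two nodes run a gfs master, as the comment above notes:
def _pick_other_gfs_master(all_gfs_masters, node):
    # Prefer the first listed GFS master node that is not `node`.
    if all_gfs_masters[0] == node:
        return all_gfs_masters[1]
    return all_gfs_masters[0]

assert _pick_other_gfs_master(['ent1', 'ent2'], 'ent1') == 'ent2'
assert _pick_other_gfs_master(['ent1', 'ent2'], 'ent3') == 'ent1'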
Example #3
def GFSMasterLockHolder(ver, testver):
    """ find the primary gfs_mater using chubby

  Arguments:
    ver:     '4.6.5'
    testver: 0 - not a test version. 1 - test version.

  Returns:
    'ent1' if ent1 is the primary gfs_master
    None if could not find out.
  """

    nodes = core_utils.GetNodes()
    if len(nodes) == 1:
        return None

    lockfile = '/ls/%s/gfs/ent/master-lock' % core_utils.GetCellName(ver)
    basecmd = core_utils.GetLSClientCmd(ver, testver)
    fi = os.popen('%s cat %s' % (basecmd, lockfile), 'r')
    data = fi.read()
    ret = fi.close()
    if ret:
        return None
    if data.startswith('ent'):
        return data.split(':', 2)[0]
    return None
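# A minimal, self-contained sketch of the lock-file parsing above, assuming
# the chubby master-lock file holds text like 'ent1:3840' (node:port); the
# exact file format is an assumption, and this helper is illustrative only:
def _parse_lock_holder(data):
    # Return the node name if the contents look like a GSA node entry.
    if data.startswith('ent'):
        return data.split(':', 2)[0]
    return None

assert _parse_lock_holder('ent1:3840') == 'ent1'
assert _parse_lock_holder('') is None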
def CheckGFSState(ver, testver, nodes=None, gfs_status_set=0, gfs_status=None):
    """Checks the GFS Master Election Status.

  Args:
    ver: GSA version
    testver: whether this is a test version
    nodes: nodes on the box (for unit tests)
    gfs_status_set: whether the caller is supplying gfs_status directly
    gfs_status: unit test only, the GFS status.

  Returns:
    1 if the state is not good (master election hasn't happened yet). 0 otherwise.
    Always returns 0 for oneways.
  """
    if not nodes:
        nodes = core_utils.GetNodes()
    if len(nodes) == 1:
        return 0

    if not gfs_status_set:
        # print 'finding gfs_status for ver=%s testver=%s' % (ver, testver)
        gfs_status = gfs_utils.CheckNoMasterUsingElectionStatus(ver, testver)

    if gfs_status == 0:
        return 0

    return 1
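# Illustrative unit-test style calls of CheckGFSState, exercising the
# gfs_status_set/gfs_status hooks documented above (node names and the
# version string are assumptions):
assert CheckGFSState('4.6.5', 0, nodes=['ent1']) == 0          # oneway: always OK
assert CheckGFSState('4.6.5', 0, nodes=['ent1', 'ent2', 'ent3'],
                     gfs_status_set=1, gfs_status=0) == 0      # master elected
assert CheckGFSState('4.6.5', 0, nodes=['ent1', 'ent2', 'ent3'],
                     gfs_status_set=1, gfs_status=1) == 1      # no master yet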
Example #5
def _StatusFileCmd(cmd, version, out=[], extra_arg='', unittestdir=None):
    """Perform a command on the RESET_STATE status file.

  On a cluster, runs lockserv <cmd> /ls/ent4-x-x/RESET_STATE.
  On a oneway, runs cmd on /export/hda3/4.x.x/RESET_STATE.
  cmd should be cat, setcontents, or rm.
  Returns: 0 for success, nonzero for error.
  Command output is returned in out.
  """

    if unittestdir != None or 1 == len(core_utils.GetNodes()):
        # unittest or oneway
        if unittestdir != None:
            file = '/%s/%s/RESET_STATE' % (unittestdir, version)
        else:
            file = '/export/hda3/%s/RESET_STATE' % version
        if cmd == 'cat':
            status = _ExecuteCommand('cat %s' % file, out=out)
        elif cmd == 'setcontents':
            status = _ExecuteCommand('echo "%s" > %s' % (extra_arg, file))
        elif cmd == 'rm':
            status = _ExecuteCommand('rm -f %s' % file)
        else:
            logging.error('StatusFileCmd: bad command %s' % cmd)
            return 1
        return status

    lockserv_cmd_prefix = core_utils.GetLSClientCmd(
        version, install_utilities.is_test(version))
    chubby_file = '/ls/%s/RESET_STATE' % core_utils.GetCellName(version)
    lockserv_cmd = '%s %s %s %s' % (lockserv_cmd_prefix, cmd, chubby_file,
                                    extra_arg)
    logging.info('Reset index: executing %s' % lockserv_cmd)
    status = _ExecuteCommand(lockserv_cmd)
    return status
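# Illustrative usage of _StatusFileCmd via its unittestdir hook (a sketch;
# the directory, version string, and call sequence are assumptions):
#
#   out = []
#   _StatusFileCmd('setcontents', '4.6.5', extra_arg='RESET',
#                  unittestdir='tmp/reset_state_test')
#   _StatusFileCmd('cat', '4.6.5', out=out, unittestdir='tmp/reset_state_test')
#   _StatusFileCmd('rm', '4.6.5', unittestdir='tmp/reset_state_test')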
def main(argv):
    # Check if the box is cluster first.
    nodes = core_utils.GetNodes()
    if len(nodes) == 1:
        sys.exit(0)

    if len(argv) < 2:
        usage()
        sys.exit(1)
    ver = argv[1]

    if len(argv) < 3:
        argv.append(0)
    testver = int(argv[2])

    # Check the GFS master election status
    status = CheckGFSState(ver, testver, nodes, 0, None)
    if status == 0:
        logging.info('GFS status is good, exiting')
        sys.exit(0)

    # return a non zero status.
    sys.exit(1)
Example #7
def GetInitState(entcfg):
  """Returns System's initialization state. For oneway, it is the value of
  C.ENT_SYSTEM_INIT_STATE and for clusters, it is the value stored in chubby
  file /ls/ent<version>/ENT_SYTEM_INIT_STATE.

  If chubby file is non existent, it returns state C.FRESH.

  @param entcfg - of type googleconfig.
  @return - state
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    return entcfg.var(C.ENT_SYSTEM_INIT_STATE)

  # For cluster, get the state from chubby.
  version = entcfg.var('VERSION')
  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
  chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)

  # Verify that chubby is functional. We do not want to accidentally return
  # FRESH state that can result in total wipe out of data.
  ls_cmd = '%s ls %s' % (lockserv_cmd_prefix, chubby_root_dir)
  (status, output) = E.getstatusoutput(ls_cmd)
  if E.ERR_OK != status:
    logging.fatal('GetInitState: Could not talk to chubby.')
    return None

  cat_cmd = '%s cat %s/%s' % (lockserv_cmd_prefix, chubby_root_dir,
                              'ENT_SYSTEM_INIT_STATE')
  (status, state) = E.getstatusoutput(cat_cmd)
  if E.ERR_OK != status:
    # For fresh install, file init_state won't exist in chubby yet.
    # Hence, consider this as a FRESH state.
    state = C.FRESH
  logging.info('current system init state: %s', state)
  return state
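# Illustrative read-back of the state written by SetInitState (a sketch;
# `entcfg` is assumed to be a googleconfig instance as in the docstring):
#
#   state = GetInitState(entcfg)
#   if state == C.FRESH:
#       # the chubby file did not exist yet: treat the box as freshly installed
#       ...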
    def change_install_state(self):
        """
    Tries to change the state of the present version to target_state.
    Returns true in case of success.
    Here is a summary of what it does:
      1. Gets the list of active nodes
      2. Gets the lists of services to start and stop
      3. In case there is something to start:
          a. reconfigures the net on all nodes after verifying quorum
          b. starts core services
      4. Verifies that a master has been elected.
      5. Starts a thread for each node to start and stop the needed services
      6. Waits for output from each thread
      7. Calculates success or failure based on thread results
      8. Asks each thread to print its status regarding what services
         it actually started or stopped and what the return code and
         error message were, if any.
    """
        if not install_utilities.safe_transition(self.version_,
                                                 self.target_state_):
            return 0

        current_state = install_utilities.install_state(self.version_)

        start = time.time()
        # make sure svs is running
        svs_utilities.CheckSvsAlive(self.machines_)

        # get list of active nodes
        active_nodes = core_utils.GetLiveNodes(logging, self.retry_)
        ignore = core_utils.GetNodeFailures(core_utils.GetTotalNodes())
        # account for already inactive nodes
        ignore = ignore - (core_utils.GetTotalNodes() - len(active_nodes))
        ver = self.version_
        home = self.enthome_

        # See what we have to start / stop
        services_to_start = install_utilities.state_services_to_start(
            self.target_state_, self.machines_)
        services_to_stop = install_utilities.state_services_to_stop(
            install_utilities.install_state(self.version_), self.machines_)

        # Make some decisions
        total_nodes = len(self.cp_.var('ENT_ALL_MACHINES'))
        onebox = (total_nodes == 1)
        startcore = services_to_start and not onebox and not self.nonecore_only_
        checkquorum = startcore
        stopcore = (services_to_stop and not onebox and not self.nonecore_only_
                    and self.target_state_ == 'INACTIVE')
        doservices = (not self.core_only_
                      and (services_to_start or services_to_stop))
        if self.target_state_ in ['INACTIVE']:
            # ent_core does not really know the state. install_manager
            # has to tell ent_core when "makeinactive"
            testver = install_utilities.install_state(self.version_)
        else:
            testver = self.target_state_ in ['TEST', 'INSTALL']
        # If it is onebox and target state is INSTALL, do not run reconfigure_net
        # This is to support pre 4.4 version migration code.
        reconfigurenet_enabled = not (onebox and
                                      (self.target_state_ == 'INSTALL'))

        # if stopping only core services, check that non-core components are
        # not running
        if (install_utilities.install_state(self.version_) == 'ACTIVE'
                and self.target_state_ == 'INACTIVE' and self.core_only_):
            logging.fatal("cannot stop core services while non-core services "\
                          "are running")

        # Execute the decisions
        if checkquorum:
            # We check quorum only when services are to be started.
            # We mainly need quorum for core services. For non-core services like
            # crawl, logcontrol etc. we use user-specified machines.
            core_utils.VerifyQuorum(active_nodes)

        # check if syslogd.conf and klogd.conf exist
        install_utilities.check_klogd_syslogd_conf(active_nodes, home)

        # Kill any spurious adminrunner/adminconsole processes if we are entering
        # TEST or ACTIVE mode.
        if self.target_state_ in ['TEST', 'ACTIVE']:
            install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                           core_utils.GetNodes(1))

        # reconfigure without restarting gems
        success = 1
        if reconfigurenet_enabled and services_to_start:
            # check if we need to force an NTP reconfig when upgrading from 4.4
            force_ntp_reconfig = 0
            if self.target_state_ in ['TEST', 'ACTIVE']:
                last_version = install_utilities.get_latest_version(
                    except_for=1)
                if (last_version is None or version_utilities.CmpVersions(
                        last_version, NEW_NTP_OPTION_GSA_VERSION) > 0):
                    force_ntp_reconfig = 1
            success = reconfigurenet_util.doReconfigureNet(
                self.cp_, active_nodes, force_ntp_reconfig=force_ntp_reconfig)
            if not success:
                logging.error('reconfigurenet failed.')

        # if starting non-core services, check that core services are running
        if (not onebox and self.nonecore_only_
                and self.target_state_ in ['TEST', 'ACTIVE']):
            core_running = install_utilities.is_core_running(ver,
                                                             home,
                                                             active_nodes,
                                                             ignore=ignore,
                                                             testver=testver)
            if not core_running:
                logging.fatal("cannot start none core services "\
                              "when core services are not running")

        # start core services if needed
        if startcore and success:
            # Retry 3 times for master verification failures
            num_retry = 3
            # it is always OK to reinit core services if the version is in
            # INSTALLED state
            self.reinitok_ = install_utilities.reinit_core_ok(ver,
                                                              home,
                                                              active_nodes,
                                                              ignore=ignore,
                                                              testver=testver)
            i = 1
            while i <= num_retry:
                # stop core services when retrying
                if i > 1:
                    time.sleep(15)
                    install_utilities.stop_core(ver,
                                                home,
                                                active_nodes,
                                                testver=testver)
                    time.sleep(15)
                i = i + 1
                # Run ent_core --ver=<ver> --activate --gfs=0 through install_utilities.py
                success = install_utilities.start_core(ver,
                                                       home,
                                                       active_nodes,
                                                       ignore=ignore,
                                                       testver=testver,
                                                       gfs=0)
                if not success:
                    if i <= num_retry:
                        logging.error(
                            'Error activating core services. Retrying...')
                    elif self.reinitok_:
                        # it is OK to ignore errors when trying to re-init core services
                        install_utilities.reinit_core(ver,
                                                      home,
                                                      active_nodes,
                                                      ignore=1,
                                                      testver=testver)
                        i = 1
                        self.reinitok_ = None
                    else:
                        logging.error('Error activating core services.')
                else:
                    # Make sure a master has been elected. If we go ahead without
                    # verifying the master, it will take a very long time for
                    # services to start. Making sure a master is elected by now
                    # results in a very quick adminrunner startup.
                    success = verify_master(ver, testver)
                    if success:
                        if not core_utils.InitDeadNodes(ver, testver,
                                                        logging) == 0:
                            logging.fatal(
                                'Error updating dead nodes to the lockserver.')
                        break
                    if i <= num_retry:
                        logging.error(
                            'Error verifying the master. Retrying...')
                    elif self.reinitok_:
                        # it is OK to ignore errors when trying to re-init core services
                        install_utilities.reinit_core(ver,
                                                      home,
                                                      active_nodes,
                                                      ignore=1,
                                                      testver=testver)
                        i = 1
                        self.reinitok_ = None
                    else:
                        raise core_utils.EntMasterError(
                            'Error getting current GSA master'
                            ' from chubby.')
            # force gsa master on the desired node
            desired_gsa_master_node = core_utils.DesiredMasterNode()
            if desired_gsa_master_node is None:
                logging.fatal('No suitable node to run GSA master')
            logging.info('Forcing %s to become GSA master' %
                         desired_gsa_master_node)
            find_master.ForceMaster(desired_gsa_master_node, testver)

            # make sure the transaction logs are in sync and start gfs
            success = install_utilities.start_gfs(ver,
                                                  home,
                                                  active_nodes,
                                                  ignore=ignore,
                                                  testver=testver)

            # make sure gfs master is not the GSA master node
            logging.info('Ensuring %s does not become GFS master' %
                         desired_gsa_master_node)
            gfs_utils.AvoidGFSMasterOnNode(ver, testver,
                                           desired_gsa_master_node)

        if doservices and success:
            node_threads = {}
            for n in self.machines_:
                node_threads[n] = NodeInstallManager(n, self.target_state_,
                                                     self.version_,
                                                     services_to_start,
                                                     services_to_stop)

            # start node threads
            for (n, t) in node_threads.items():
                logging.info('STATUS: Starting thread for %s' % n)
                t.start()

            # wait for threads
            for (n, t) in node_threads.items():
                t.join()
                success = success and (t.err_ == 0)

            for (n, t) in node_threads.items():
                t.print_status()

        if stopcore and success:
            func = lambda: install_utilities.stop_core(
                ver, home, active_nodes, testver=testver)
            success = try_repeatedly(func, success=1)
            if not success:
                logging.error('Error inactivating core services.')

        # Start/Stop Borgmon and Reactor
        if self.cp_.var('ENT_ENABLE_EXTERNAL_BORGMON'):
            enable_external_borgmon = '--enable_external'
        else:
            enable_external_borgmon = '--noenable_external'
        borgmon_cmd = (
            "/export/hda3/%s/local/google3/enterprise/util/borgmon_util.py "
            "--ver %s --logtostderr %s" %
            (self.version_, self.version_, enable_external_borgmon))
        if success and current_state != self.target_state_:
            # 1) Stop Borgmon and Reactor if required
            if current_state in ['SERVE', 'TEST', 'ACTIVE']:
                E.execute(self.machines_,
                          "%s --mode %s --stop" % (borgmon_cmd, current_state),
                          None, 0)
            # 2) Start Borgmon and Reactor if required
            logging.info("target_state: %s" % self.target_state_)
            if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
                E.execute(
                    self.machines_,
                    "%s --mode %s --start" % (borgmon_cmd, self.target_state_),
                    None, 0)

        # Start/Stop Session Manager only for oneways
        if core_utils.GetTotalNodes() == 1:
            if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
                sessionmanager_util.ActivateSessionManager(ver, testver)
            if self.target_state_ == 'INACTIVE' and success:
                sessionmanager_util.DeactivateSessionManager(ver, testver)

        # Kill any spurious adminrunner/adminconsole processes if we are entering
        # INACTIVE or SERVE mode.
        if self.target_state_ in ['SERVE', 'INACTIVE']:
            install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                           core_utils.GetNodes(1))

        if self.target_state_ == 'INACTIVE' and success and not self.nonecore_only_:
            install_utilities.InactivateCleanup(ver, home, active_nodes)

        end = time.time()
        diff = (end - start) / 60
        logging.info("STAT: change_install_state took %.2f minutes." % diff)
        return success
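# The core-activation loop in change_install_state follows a retry-with-reinit
# pattern. A standalone sketch of that control flow (illustrative only; this
# helper and its callables are not part of the original module):
def _retry_with_reinit(activate, verify, reinit, reinit_ok, num_retry=3):
    attempt = 1
    while attempt <= num_retry:
        attempt += 1
        if activate() and verify():
            return 1                  # success
        if attempt <= num_retry:
            continue                  # plain retry
        if reinit_ok:
            reinit()                  # last resort: reinitialize and start over
            attempt = 1
            reinit_ok = None
        else:
            return 0                  # retries and reinit exhausted
    return 0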