def remove(self, machine):
  """ This removes a machine from the configuration """

  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)
  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  # Halt the machine if APC is used.
  error = self.halt(machine)

  self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
  self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
  ret = core_utils.AddDeadNode(ver, testver, machine)
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
  if ret:
    logging.error('Cannot add dead node to the lockserver.')
    # we ignore this error for now

  # now we need to remove the data disks that were on this machine
  data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
  if data_disks.has_key(machine):
    del data_disks[machine]
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))

  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEREMOVED % machine, "", true)

  return error
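
# Hedged usage sketch (not part of the original source): how the remove()
# method above might be driven from surrounding admin code.  The
# `machine_handler` object and the node names are hypothetical; the only
# assumption taken from the code above is that a non-zero return value
# indicates an error.
def _example_remove_nodes(machine_handler, nodes):
  failed = []
  for node in nodes:
    # remove() returns non-zero when the node is unknown, is the local
    # host, or when updating the configuration or halting the node fails.
    if machine_handler.remove(node) != 0:
      failed.append(node)
  if failed:
    logging.error('could not remove: %s' % ', '.join(failed))
  return not failed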
def change_install_state(self):
  """ Tries to change the state of the present version to target_state.
  Returns true in case of success.
  Here is a summary of what it does:
    1. Gets the list of active nodes
    2. Gets the list of services to start and stop
    3. In case there is something to start
       a. reconfigures the net on all nodes after verifying quorum
       b. starts core services
    4. Verifies there is a master elected.
    5. Starts a thread for each node to start and stop the needed services
    6. Waits for output from each thread
    7. Calculates success or failure based on thread results
    8. Asks each thread to print its status regarding what services it
       actually started or stopped and what was the return code and
       error message if any.
  """
  if not install_utilities.safe_transition(self.version_,
                                           self.target_state_):
    return 0

  current_state = install_utilities.install_state(self.version_)

  start = time.time()
  # make sure svs is running
  svs_utilities.CheckSvsAlive(self.machines_)

  # get list of active nodes
  active_nodes = core_utils.GetLiveNodes(logging, self.retry_)
  ignore = core_utils.GetNodeFailures(core_utils.GetTotalNodes())
  # account for already inactive nodes
  ignore = ignore - (core_utils.GetTotalNodes() - len(active_nodes))
  ver = self.version_
  home = self.enthome_

  # See what we have to start / stop
  services_to_start = install_utilities.state_services_to_start(
      self.target_state_, self.machines_)
  services_to_stop = install_utilities.state_services_to_stop(
      install_utilities.install_state(self.version_), self.machines_)

  # Make some decisions
  total_nodes = len(self.cp_.var('ENT_ALL_MACHINES'))
  onebox = (total_nodes == 1)
  startcore = services_to_start and not onebox and not self.nonecore_only_
  checkquorum = startcore
  stopcore = (services_to_stop and not onebox and not self.nonecore_only_
              and self.target_state_ == 'INACTIVE')
  doservices = (not self.core_only_
                and (services_to_start or services_to_stop))
  if self.target_state_ in ['INACTIVE']:
    # ent_core does not really know the state. install_manager
    # has to tell ent_core when "makeinactive"
    testver = install_utilities.install_state(self.version_)
  else:
    testver = self.target_state_ in ['TEST', 'INSTALL']
  # If it is onebox and target state is INSTALL, do not run reconfigure_net.
  # This is to support pre 4.4 version migration code.
  reconfigurenet_enabled = not (onebox and (self.target_state_ == 'INSTALL'))

  # if stopping core-only services, check if non-core components are running
  if (install_utilities.install_state(self.version_) == 'ACTIVE' and
      self.target_state_ == 'INACTIVE' and self.core_only_):
    logging.fatal("cannot stop core services while non-core services "
                  "are running")

  # Execute the decisions
  if checkquorum:
    # We check quorum only when services are to be started.
    # We mainly need quorum for core services. For non core services like
    # crawl, logcontrol etc. we use user specified machines.
    core_utils.VerifyQuorum(active_nodes)

  # check if syslogd.conf and klogd.conf exist
  install_utilities.check_klogd_syslogd_conf(active_nodes, home)

  # Kill any spurious adminrunner/adminconsole processes if we are entering
  # TEST or ACTIVE mode.
  if self.target_state_ in ['TEST', 'ACTIVE']:
    install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                   core_utils.GetNodes(1))

  # reconfigure without restarting gems
  success = 1
  if reconfigurenet_enabled and services_to_start:
    # check if we need to force NTP reconfig if this is to upgrade from 4.4
    force_ntp_reconfig = 0
    if self.target_state_ in ['TEST', 'ACTIVE']:
      last_version = install_utilities.get_latest_version(except_for=1)
      if (last_version is None or
          version_utilities.CmpVersions(last_version,
                                        NEW_NTP_OPTION_GSA_VERSION) > 0):
        force_ntp_reconfig = 1
    success = reconfigurenet_util.doReconfigureNet(
        self.cp_, active_nodes, force_ntp_reconfig=force_ntp_reconfig)
    if not success:
      logging.error('reconfigurenet failed.')

  # if starting non-core services, check if core services are running
  if (not onebox and self.nonecore_only_ and
      self.target_state_ in ['TEST', 'ACTIVE']):
    core_running = install_utilities.is_core_running(
        ver, home, active_nodes, ignore=ignore, testver=testver)
    if not core_running:
      logging.fatal("cannot start non-core services "
                    "when core services are not running")

  # start core services if needed
  if startcore and success:
    # Retry 3 times for master verification failures
    num_retry = 3
    # it is always OK to reinit core services if the version is in
    # INSTALLED state
    self.reinitok_ = install_utilities.reinit_core_ok(
        ver, home, active_nodes, ignore=ignore, testver=testver)
    i = 1
    while i <= num_retry:
      # stop core services when retrying
      if i > 1:
        time.sleep(15)
        install_utilities.stop_core(ver, home, active_nodes, testver=testver)
        time.sleep(15)
      i = i + 1
      # Run ent_core --ver=<ver> --activate --gfs=0 through
      # install_utilities.py
      success = install_utilities.start_core(ver, home, active_nodes,
                                             ignore=ignore, testver=testver,
                                             gfs=0)
      if not success:
        if i <= num_retry:
          logging.error('Error activating core services. Retrying...')
        elif self.reinitok_:
          # it is OK to ignore errors when trying to re-init core services
          install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                        testver=testver)
          i = 1
          self.reinitok_ = None
        else:
          logging.error('Error activating core services.')
      else:
        # Make sure a master has been elected. If we go ahead without
        # verifying the master then it will take a very long time for
        # services to be started. Making sure a master is elected by now
        # results in very quick adminrunner startup.
        success = verify_master(ver, testver)
        if success:
          if not core_utils.InitDeadNodes(ver, testver, logging) == 0:
            logging.fatal('Error updating dead nodes to the lockserver.')
          break
        if i <= num_retry:
          logging.error('Error verifying the master. Retrying...')
        elif self.reinitok_:
          # it is OK to ignore errors when trying to re-init core services
          install_utilities.reinit_core(ver, home, active_nodes, ignore=1,
                                        testver=testver)
          i = 1
          self.reinitok_ = None
        else:
          raise core_utils.EntMasterError, ('Error getting current GSA master'
                                            ' from chubby.')

    # force gsa master on the desired node
    desired_gsa_master_node = core_utils.DesiredMasterNode()
    if desired_gsa_master_node is None:
      logging.fatal('No suitable node to run GSA master')
    logging.info('Forcing %s to become GSA master' % desired_gsa_master_node)
    find_master.ForceMaster(desired_gsa_master_node, testver)

    # make sure the transaction logs are in sync and start gfs
    success = install_utilities.start_gfs(ver, home, active_nodes,
                                          ignore=ignore, testver=testver)

    # make sure gfs master is not the GSA master node
    logging.info('Ensuring %s not to become GFS master' %
                 desired_gsa_master_node)
    gfs_utils.AvoidGFSMasterOnNode(ver, testver, desired_gsa_master_node)

  if doservices and success:
    node_threads = {}
    for n in self.machines_:
      node_threads[n] = NodeInstallManager(n, self.target_state_,
                                           self.version_,
                                           services_to_start,
                                           services_to_stop)

    # start node threads
    for (n, t) in node_threads.items():
      logging.info('STATUS: Starting thread for %s' % n)
      t.start()

    # wait for threads
    for (n, t) in node_threads.items():
      t.join()
      success = success and (t.err_ == 0)

    for (n, t) in node_threads.items():
      t.print_status()

  if stopcore and success:
    func = lambda: install_utilities.stop_core(ver, home, active_nodes,
                                               testver=testver)
    success = try_repeatedly(func, success=1)
    if not success:
      logging.error('Error inactivating core services.')

  # Start/Stop Borgmon and Reactor
  if self.cp_.var('ENT_ENABLE_EXTERNAL_BORGMON'):
    enable_external_borgmon = '--enable_external'
  else:
    enable_external_borgmon = '--noenable_external'
  borgmon_cmd = (
      "/export/hda3/%s/local/google3/enterprise/util/borgmon_util.py "
      "--ver %s --logtostderr %s" %
      (self.version_, self.version_, enable_external_borgmon))
  if success and current_state != self.target_state_:
    # 1) Stop Borgmon and Reactor if required
    if current_state in ['SERVE', 'TEST', 'ACTIVE']:
      E.execute(self.machines_,
                "%s --mode %s --stop" % (borgmon_cmd, current_state),
                None, 0)
    # 2) Start Borgmon and Reactor if required
    logging.info("target_state: %s" % self.target_state_)
    if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
      E.execute(self.machines_,
                "%s --mode %s --start" % (borgmon_cmd, self.target_state_),
                None, 0)

  # Start/Stop Session Manager only for oneways
  if core_utils.GetTotalNodes() == 1:
    if self.target_state_ in ['SERVE', 'TEST', 'ACTIVE']:
      sessionmanager_util.ActivateSessionManager(ver, testver)
    if self.target_state_ == 'INACTIVE' and success:
      sessionmanager_util.DeactivateSessionManager(ver, testver)

  # Kill any spurious adminrunner/adminconsole processes if we are entering
  # INACTIVE or SERVE mode.
  if self.target_state_ in ['SERVE', 'INACTIVE']:
    install_utilities.kill_service(['adminrunner', 'adminconsole'],
                                   core_utils.GetNodes(1))

  if self.target_state_ == 'INACTIVE' and success and not self.nonecore_only_:
    install_utilities.InactivateCleanup(ver, home, active_nodes)

  end = time.time()
  diff = (end - start) / 60
  logging.info("STAT: change_install_state took %.2f minutes." % diff)
  return success
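
# Hedged usage sketch (not part of the original source): a minimal driver
# for change_install_state() above.  How the InstallManager-like object is
# constructed is hypothetical; the sketch only relies on the method
# returning a truthy value on success, as documented in its docstring, and
# on the target_state_ attribute referenced in the code above.
def _example_run_transition(manager):
  ok = manager.change_install_state()
  if not ok:
    logging.error('STATUS: transition to %s failed' % manager.target_state_)
    return 1
  return 0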
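
# Minimal, generalized sketch (not in the original source) of the retry
# pattern used by the core-activation loop above: try an operation up to
# num_retry times, and if every attempt fails while a one-time "reinit"
# fallback is still allowed, run the fallback and restart the retry budget.
# The callables `attempt` and `reinit` are hypothetical stand-ins for
# install_utilities.start_core() / reinit_core().
def _retry_with_reinit(attempt, reinit, num_retry=3, reinit_ok=True):
  allow_reinit = reinit_ok
  i = 1
  while i <= num_retry:
    i = i + 1
    if attempt():
      return True
    if i <= num_retry:
      logging.error('attempt failed, retrying...')
    elif allow_reinit:
      # exhausted the retries; re-initialize once and start over
      reinit()
      i = 1
      allow_reinit = False
    else:
      logging.error('attempt failed, giving up.')
  return False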