def _StatusFileCmd(cmd, version, out=[], extra_arg='', unittestdir=None): """Perform a command on the RESET_STATE status file. On a cluster, runs lockserv <cmd> /ls/ent4-x-x/RESET_STATE. On a oneway, runs cmd on /export/hda3/4.x.x/RESET_STATE cmd should be cat, setcontents, or rm. Return: None for oneway, 0 for success, 1 for error Command output returned in out. """ if unittestdir != None or 1 == len(core_utils.GetNodes()): # unitest or Oneway if unittestdir != None: file = '/%s/%s/RESET_STATE' % (unittestdir, version) else: file = '/export/hda3/%s/RESET_STATE' % version if cmd == 'cat': status = _ExecuteCommand('cat %s' % file, out=out) elif cmd == 'setcontents': status = _ExecuteCommand('echo "%s" > %s' % (extra_arg, file)) elif cmd == 'rm': status = _ExecuteCommand('rm -f %s' % file) else: logging.error('StatusFileCmd: bad command %s' % cmd) return 1 return status lockserv_cmd_prefix = core_utils.GetLSClientCmd( version, install_utilities.is_test(version)) chubby_file = '/ls/%s/RESET_STATE' % core_utils.GetCellName(version) lockserv_cmd = '%s %s %s %s' % (lockserv_cmd_prefix, cmd, chubby_file, extra_arg) logging.info('Reset index: executing %s' % lockserv_cmd) status = _ExecuteCommand(lockserv_cmd) return status
def _StatusFileCmd(cmd, version, out=[], extra_arg='', unittestdir=None): """Perform a command on the RESET_STATE status file. On a cluster, runs lockserv <cmd> /ls/ent4-x-x/RESET_STATE. On a oneway, runs cmd on /export/hda3/4.x.x/RESET_STATE cmd should be cat, setcontents, or rm. Return: None for oneway, 0 for success, 1 for error Command output returned in out. """ if unittestdir != None or 1 == len(core_utils.GetNodes()): # unitest or Oneway if unittestdir != None: file = '/%s/%s/RESET_STATE' % (unittestdir, version) else: file = '/export/hda3/%s/RESET_STATE' % version if cmd == 'cat': status = _ExecuteCommand('cat %s' % file, out=out) elif cmd == 'setcontents': status = _ExecuteCommand('echo "%s" > %s' % (extra_arg, file)) elif cmd == 'rm': status = _ExecuteCommand('rm -f %s' % file) else: logging.error('StatusFileCmd: bad command %s' % cmd) return 1 return status lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, install_utilities.is_test(version)) chubby_file = '/ls/%s/RESET_STATE' % core_utils.GetCellName(version) lockserv_cmd = '%s %s %s %s' % ( lockserv_cmd_prefix, cmd, chubby_file, extra_arg) logging.info('Reset index: executing %s' % lockserv_cmd) status = _ExecuteCommand(lockserv_cmd) return status
def remove(self, machine):
  """Remove a machine from the configuration.

  Args:
    machine: name of the machine to remove. It must be listed in the
      MACHINES global parameter and must not be the current host.

  Returns:
    1 if the machine is unknown, is the current host, if DATACHUNKDISKS
    cannot be stored, or if machine allocation fails; otherwise the value
    returned by self.halt(machine).
  """
  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  # Reject self-removal before any side effect. This check used to come after
  # stop_core(), so a rejected request had already asked the local node to
  # stop its core services.
  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  # Halt the machine if APC is used.
  error = self.halt(machine)

  self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
  self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
  ret = core_utils.AddDeadNode(ver, testver, machine)
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
  if ret:
    logging.error('Cannot add dead node to the lockserver.')
    # we ignore this error for now

  # now we need to remove the data disks that were on this machine
  data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
  if machine in data_disks:
    del data_disks[machine]
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the parameter is assumed to be rewritten only when an entry was removed.
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))

  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEREMOVED % machine, "", true)

  return error
def remove(self, machine):
  """Remove a machine from the configuration.

  Args:
    machine: name of the machine to remove. It must be listed in the
      MACHINES global parameter and must not be the current host.

  Returns:
    1 if the machine is unknown, is the current host, if DATACHUNKDISKS
    cannot be stored, or if machine allocation fails; otherwise the value
    returned by self.halt(machine).
  """
  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  # Reject self-removal before any side effect. This check used to come after
  # stop_core(), so a rejected request had already asked the local node to
  # stop its core services.
  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  # Halt the machine if APC is used.
  error = self.halt(machine)

  self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
  self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
  ret = core_utils.AddDeadNode(ver, testver, machine)
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])
  if ret:
    logging.error('Cannot add dead node to the lockserver.')
    # we ignore this error for now

  # now we need to remove the data disks that were on this machine
  data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
  if machine in data_disks:
    del data_disks[machine]
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the parameter is assumed to be rewritten only when an entry was removed.
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))

  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEREMOVED % machine, "", true)

  return error
def _ClearIndex(cfg, version):
  """Clear the index directories in GFS/bigfiles.

  Removes, in order: the local urltracker directory (if readable), the
  top-level directories under /gfs/ent/ (only when a GFS cell is
  configured), the top-level directories under /bigfile/, and the local
  spelling data directory (if readable).

  Args:
    cfg: configuration object providing getGlobalParam().
    version: version string, used to look up the GFS aliases.

  Returns:
    '' for success, an error message string for failure.
  """
  logging.info('Reset Index: ClearIndex')
  logging.flush()

  # Delete (local) urltracker data on oneway.
  urltracker_dir = cfg.getGlobalParam('URLTRACKER_DIRECTORY')
  if os.access(urltracker_dir, os.R_OK):
    cmd = ('rm -R -f %s' % (urltracker_dir))
    logging.info('Deleting local urltracker directory: %s' % (urltracker_dir))
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local urltracker data to delete')

  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('Deleting GFS files')
    gfs_aliases = core_utils.GetGFSAliases(
        version, install_utilities.is_test(version))
    dirs_not_removed = _RemoveTopLevelDirs(cfg, '/gfs/ent/',
                                           gfs_aliases=gfs_aliases)
    if len(dirs_not_removed) > 0:
      return 'Shared file removal failed.'

  logging.info('Deleting bigfiles')
  datadir = '%s/data/enterprise-data' % cfg.getGlobalParam('ENTERPRISE_HOME')
  dirs_not_removed = _RemoveTopLevelDirs(cfg, '/bigfile/', datadir=datadir)
  if len(dirs_not_removed) > 0:
    return 'File removal failed.'

  # delete spelling data on oneway:
  spell_root = cfg.getGlobalParam('ENT_SPELL_ROOT_DIR')
  # endswith() instead of spell_root[-1]: indexing an empty string raised
  # IndexError; an empty root now falls through to the "nothing to delete"
  # branch below.
  if spell_root.endswith('/'):
    spell_root = spell_root[:-1]
  if os.access(spell_root, os.R_OK):
    cmd = ('rm -R -f %s' % spell_root)
    logging.info('Deleting local (non-gfs) spelling data')
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local (non-gfs) spelling data to delete')

  return ''
def babysit(self):
  """Run one babysit pass for this service.

  When the local machine is not the master, the service is stopped and the
  result of that stop is returned. On the master, the service is first
  stopped if a restart is due; then, if core_utils names a desired master
  other than this machine, mastership is forced over to it and 1 is returned.
  Otherwise the service is started and the result of that start is returned.
  """
  if not self.local_machine_is_master:
    return self._stop(op="babysit")

  if self.is_time_to_restart():
    self.record_restart()
    self._stop(op="babysit")

  # Adjust the gsa master node. This only matters on clusters: the desired
  # master is None on a one-way.
  wanted_master = core_utils.DesiredMasterNode()
  if wanted_master is not None and wanted_master != self.local_machine:
    find_master.ForceMaster(wanted_master,
                            install_utilities.is_test(self.version))
    return 1

  return self._start(op="babysit")
def SVSErrorsStatus(self, lockserv_cmd_out=None):
  """Check the SVS errors recorded by gsa-master.

  Only does anything when the configuration type is 'CLUSTER'. For every
  live machine the recorded error text is read (from the lockserver file
  /ls/<cell>/svs_<machine>, or from lockserv_cmd_out when given) and split
  into lines. Lines containing 'unrecoverable error' or 'file system error'
  are ignored; any other line marks the machine as having an error.

  Args:
    lockserv_cmd_out: for unit tests only; a dict mapping machine name to
      the text that would have been read,
      e.g. {'ent1': 'machine problem Unknown\n'}.

  Returns:
    (status, desc), e.g. (0, []). status is 1 if any machine has an SVS
    error, otherwise 0. desc holds one '<machine>: <errors>' string per
    machine with errors.
  """
  problems = []
  if self._ent_config != 'CLUSTER':
    return 0, problems

  query_lockserver = lockserv_cmd_out is None
  if query_lockserver:
    version = self._cfg.getGlobalParam('VERSION')
    lockserv_cmd_prefix = core_utils.GetLSClientCmd(
        version, install_utilities.is_test(version))

  overall_status = 0
  for machine in self._live_machines:
    if query_lockserver:
      chubby_file = '/ls/%s/svs_%s' % (core_utils.GetCellName(version),
                                       machine)
      lockserv_cmd = '%s cat %s' % (lockserv_cmd_prefix, chubby_file)
      out = []
      lockserv_status = E.execute(['localhost'], lockserv_cmd, out, 60)
    else:
      lockserv_status = 0
      out = []
      if machine in lockserv_cmd_out:
        out.append(lockserv_cmd_out[machine])

    # Nothing recorded (or the read failed): no errors for this machine.
    if lockserv_status != 0 or len(out) == 0 or out[0] == '':
      continue

    reported = []
    has_error = 0
    for line in out[0].splitlines():
      if 'unrecoverable error' in line or 'file system error' in line:
        continue  # Ignore this error
      has_error = 1  # Show an error
      if line != '':
        reported.append(line)

    if has_error:
      # A svs error has been recorded; prefix it with the machine name.
      overall_status = 1
      problems.append('%s: %s' % (machine, ' '.join(reported)))

  return overall_status, problems
def init_service(self, ent_home):
  """Do the actual initialization.

  Loads the config file into the cp (EntConfig) member, exiting the process
  if it cannot be loaded, and copies commonly used parameters onto the
  instance for easy access.

  Args:
    ent_home: value passed to entconfig.EntConfig to locate the config.
  """
  config = entconfig.EntConfig(ent_home)
  self.cp = config
  if not config.Load():
    sys.exit("Cannot load the config file %s" % config.GetConfigFileName())

  lookup = config.var

  # Frequently used parameters, cached as attributes.
  self.configfile = config.GetConfigFileName()
  self.version = str(lookup("VERSION"))
  self.entid_tag = "ENT_ID=%s_%s" % (self.version, self.service_name)
  self.ent_user = lookup("ENTERPRISE_USER")
  self.ent_group = lookup("ENTERPRISE_GROUP")
  self.ent_bashrc = lookup("ENTERPRISE_BASHRC")
  self.ent_home = lookup("ENTERPRISE_HOME")
  self.googlebot_dir = lookup("GOOGLEBOT_DIR")
  self.version_tmpdir = "%s/tmp" % lookup("ENTERPRISE_HOME")
  self.tmpdir = lookup("TMPDIR")
  self.logdir = lookup("LOGDIR")
  self.datadir = lookup("DATADIR")
  self.scripts_dir = ("%s/local/google3/enterprise/legacy/scripts" %
                      self.ent_home)
  self.util_dir = ("%s/local/google3/enterprise/legacy/util" %
                   self.ent_home)
  self.machines = lookup("MACHINES")

  # The master depends on the install state: for active / test / install the
  # master is looked up through chubby.
  # NOTE(review): the original comment mentions falling back to the MASTER
  # parameter for other states, but no such branch is visible here -- this
  # definition may be truncated; confirm against the full file.
  self.install_state = install_utilities.install_state(
      self.version, rootdir=lookup('ENT_DISK_ROOT'))
  self.local_machine = E.getCrtHostName()
  testver = install_utilities.is_test(self.version)
  if self.install_state in ["ACTIVE", "TEST", "INSTALL"]:
    try:
      self.master_machine = find_master.FindMasterUsingChubby(
          lookup('VERSION'))
    except core_utils.EntMasterError:
      # Something is seriously wrong.
      logging.error("ERROR: Couldn't determine master")
      # Assume we aren't master, so we can at least do inactivate
      self.master_machine = None
def _ClearIndex(cfg, version):
  """Clear the index directories in GFS/bigfiles.

  Removes, in order: the local urltracker directory (if readable), the
  top-level directories under /gfs/ent/ (only when a GFS cell is
  configured), the top-level directories under /bigfile/, and the local
  spelling data directory (if readable).

  Args:
    cfg: configuration object providing getGlobalParam().
    version: version string, used to look up the GFS aliases.

  Returns:
    '' for success, an error message string for failure.
  """
  logging.info('Reset Index: ClearIndex')
  logging.flush()

  # Delete (local) urltracker data on oneway.
  urltracker_dir = cfg.getGlobalParam('URLTRACKER_DIRECTORY')
  if os.access(urltracker_dir, os.R_OK):
    cmd = ('rm -R -f %s' % (urltracker_dir))
    logging.info('Deleting local urltracker directory: %s' % (urltracker_dir))
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local urltracker data to delete')

  if cfg.getGlobalParam(C.GFS_CELL):
    logging.info('Deleting GFS files')
    gfs_aliases = core_utils.GetGFSAliases(
        version, install_utilities.is_test(version))
    dirs_not_removed = _RemoveTopLevelDirs(cfg, '/gfs/ent/',
                                           gfs_aliases=gfs_aliases)
    if len(dirs_not_removed) > 0:
      return 'Shared file removal failed.'

  logging.info('Deleting bigfiles')
  datadir = '%s/data/enterprise-data' % cfg.getGlobalParam('ENTERPRISE_HOME')
  dirs_not_removed = _RemoveTopLevelDirs(cfg, '/bigfile/', datadir=datadir)
  if len(dirs_not_removed) > 0:
    return 'File removal failed.'

  # delete spelling data on oneway:
  spell_root = cfg.getGlobalParam('ENT_SPELL_ROOT_DIR')
  # endswith() instead of spell_root[-1]: indexing an empty string raised
  # IndexError; an empty root now falls through to the "nothing to delete"
  # branch below.
  if spell_root.endswith('/'):
    spell_root = spell_root[:-1]
  if os.access(spell_root, os.R_OK):
    cmd = ('rm -R -f %s' % spell_root)
    logging.info('Deleting local (non-gfs) spelling data')
    if _ExecuteCommand(cmd, machines=cfg.getGlobalParam(C.MACHINES)):
      return 'File removal failed.'
  else:
    logging.info('No local (non-gfs) spelling data to delete')

  return ''
def AvoidGFSMasterOnNode(config, node):
  """Avoid running the primary gfs master on a node.

  Arguments:
    config: instance of entconfig (its VERSION and MACHINES are read)
    node: node name, e.g. 'ent1'
  """
  version = config.VERSION
  is_testver = install_utilities.is_test(version)

  # A primary master has to exist before it can be steered away from a node.
  failure = gfs_utils.EnsureGFSMasterRunning(version, is_testver)
  if failure is None:
    gfs_utils.AvoidGFSMasterOnNode(version, is_testver, node)
  else:
    logging.error("GFSMaster_NoMaster alert detected, "
                  "but fix was not successful. Error message: [%s]" % failure)

  # ensure gfs chunkservers are added after gfs master is running
  # NOTE(review): reconstructed from whitespace-collapsed source; this call
  # is taken to run on both branches -- confirm it was not nested under the
  # success branch in the original file.
  gfs_utils.AddGFSChunkservers(version, is_testver, config.MACHINES)
def add(self, machine, apc_outlet):
  """Add a machine to the configuration.

  Args:
    machine: name of the machine to add.
    apc_outlet: value passed to apc_util.PortMap to build this machine's
      entry in the APC_MAP parameter.

  Returns:
    0 on success, 1 as soon as any required step fails.
  """
  def param(name):
    # Shorthand for a global parameter lookup on the config.
    return self.cfg.getGlobalParam(name)

  # A machine can only be added while we are in active state.
  if install_utilities.install_state(param('VERSION')) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # The machine has to be reachable before anything else is attempted.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # Start the svs on the remote machine.
  restart_svs_cmd = (
      "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
          param('ENTERPRISE_HOME'), param('ENTERPRISE_HOME'), machine))
  local_host = E.getCrtHostName()
  wrapped_svs_cmd = SECURE_WRAPPER_COMMAND % (
      param('ENTERPRISE_HOME'), "-p2", restart_svs_cmd)
  if E.execute([local_host], wrapped_svs_cmd, None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # Give the svs some time to come up, then check it is up and is the
  # right version.
  time.sleep(5)
  if not svs_utilities.PingAndCheckSvsVersion(
      param('ENTERPRISE_BASHRC'), param('ENTERPRISE_HOME'), machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  ver = param('VERSION')
  home = param('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # Update MACHINES.
  machines = param('MACHINES')
  if machine not in machines:
    machines.append(machine)
    self.cfg.setGlobalParam('MACHINES', machines)
  if core_utils.RemDeadNode(ver, testver, machine):
    logging.error('Cannot remove dead node from lockserver.')
    # we ignore this error for now

  # A new machine in the config changes the concentrator config, so
  # serve_service is re-run to write the new config and restart the
  # concentrator.
  serve_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./serve_service.py %s" % (
                   param('ENTERPRISE_BASHRC'),
                   param('ENTERPRISE_HOME'),
                   param('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll (up to 5 times, a minute apart) for the machine's disk facts.
  for _ in range(5):
    all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    time.sleep(60)
  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  # Good disks are the mounted drives not reported bad, with "3" appended
  # and a leading 's' turned into 'h' (sda3 -> hda3), without duplicates.
  bad_names = bad_disks.split(' ')
  unique_good_disks = []
  for drive in all_disks.split(' '):
    if drive in bad_names:
      continue
    partition = re.sub(r'^s', 'h', "%s3" % drive)
    if partition not in unique_good_disks:
      unique_good_disks.append(partition)

  # Add disks
  self.updatedisk(machine, unique_good_disks, true)

  # apc map update
  apc_map = self.cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not self.cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # Create appropriate datadirs on that machine.
  if not self.cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  # Replicate the config
  self.cfg.replicateConfigOnMachine(machine)

  # Reconfigure net on the target machine
  if not reconfigurenet_util.doReconfigureNet(
      self.cfg.globalParams, [machine], i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  # Start core services on the new node
  if not install_utilities.start_core(ver, home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1
  # Add the chunkserver back
  gfs_utils.AddGFSChunkservers(ver, testver, [machine])

  # Machine allocation comes first; it assigns things so that the
  # constraints are satisfied.
  if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1

  # Then try to reallocate some servers from existing machines to the new
  # machine.
  replaced = self.cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # Restart the babysitter before stopping the replaced services.
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Stop every service that was moved to the new machine.
  for server_string in replaced:
    server = serverlib.Server()
    server.InitFromName(server_string)
    replaced_type = server.servertype()
    kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
    if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" %
                    (replaced_type, server.port(), server.host()))

  # The new machine should be made active.
  if not install_utilities.set_install_state(
      machine, param('ENTERPRISE_HOME'), "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./crawl_service.py %s" % (
                   param('ENTERPRISE_BASHRC'),
                   param('ENTERPRISE_HOME'),
                   param('ENTERPRISE_HOME')))
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  # save all the params
  self.cfg.saveParams()

  # for faster crawl recovery, restart all crawl processes
  self.restart_crawl_processes(serve_cmd)

  # Activate the crawl, logcontrol and serve services on the remote machine,
  # in that order; each entry is (init script command, failure message).
  activations = (
      ("/etc/rc.d/init.d/crawl_%s activate >&/dev/null</dev/null",
       "Could not activate crawl service on machine %s"),
      ("/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null</dev/null",
       "Could not activate logcontrol service on machine %s"),
      ("/etc/rc.d/init.d/serve_%s activate >&/dev/null</dev/null",
       "Could not activate serve service on machine %s"),
  )
  for cmd_template, failure_msg in activations:
    activate_cmd = cmd_template % param('VERSION')
    wrapped_cmd = SECURE_WRAPPER_COMMAND % (
        param('ENTERPRISE_HOME'), "-e", activate_cmd)
    if E.execute([machine], wrapped_cmd, None, 0) != E.ERR_OK:
      logging.error(failure_msg % machine)
      logging.error("Please activate by hand")
      return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
  return 0
def FindMasterUsingChubby(ver):
  """Find the master using chubby based master election.

  Args:
    ver: version string; also used to decide whether this is a test version.

  Returns:
    Whatever core_utils.GetGSAMaster returns for this version.
  """
  is_testver = install_utilities.is_test(ver)
  return core_utils.GetGSAMaster(ver, is_testver)
def add(self, machine, apc_outlet):
  """Add a machine to the configuration.

  Args:
    machine: name of the machine to add.
    apc_outlet: value passed to apc_util.PortMap to build this machine's
      entry in the APC_MAP parameter.

  Returns:
    0 on success, 1 as soon as any required step fails.
  """
  def param(name):
    # Shorthand for a global parameter lookup on the config.
    return self.cfg.getGlobalParam(name)

  # A machine can only be added while we are in active state.
  if install_utilities.install_state(param('VERSION')) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # The machine has to be reachable before anything else is attempted.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # Start the svs on the remote machine.
  restart_svs_cmd = (
      "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
          param('ENTERPRISE_HOME'), param('ENTERPRISE_HOME'), machine))
  local_host = E.getCrtHostName()
  wrapped_svs_cmd = SECURE_WRAPPER_COMMAND % (
      param('ENTERPRISE_HOME'), "-p2", restart_svs_cmd)
  if E.execute([local_host], wrapped_svs_cmd, None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # Give the svs some time to come up, then check it is up and is the
  # right version.
  time.sleep(5)
  if not svs_utilities.PingAndCheckSvsVersion(
      param('ENTERPRISE_BASHRC'), param('ENTERPRISE_HOME'), machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  ver = param('VERSION')
  home = param('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # Update MACHINES.
  machines = param('MACHINES')
  if machine not in machines:
    machines.append(machine)
    self.cfg.setGlobalParam('MACHINES', machines)
  if core_utils.RemDeadNode(ver, testver, machine):
    logging.error('Cannot remove dead node from lockserver.')
    # we ignore this error for now

  # A new machine in the config changes the concentrator config, so
  # serve_service is re-run to write the new config and restart the
  # concentrator.
  serve_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./serve_service.py %s" % (
                   param('ENTERPRISE_BASHRC'),
                   param('ENTERPRISE_HOME'),
                   param('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll (up to 5 times, a minute apart) for the machine's disk facts.
  for _ in range(5):
    all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    time.sleep(60)
  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  # Good disks are the mounted drives not reported bad, with "3" appended
  # and a leading 's' turned into 'h' (sda3 -> hda3), without duplicates.
  bad_names = bad_disks.split(' ')
  unique_good_disks = []
  for drive in all_disks.split(' '):
    if drive in bad_names:
      continue
    partition = re.sub(r'^s', 'h', "%s3" % drive)
    if partition not in unique_good_disks:
      unique_good_disks.append(partition)

  # Add disks
  self.updatedisk(machine, unique_good_disks, true)

  # apc map update
  apc_map = self.cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not self.cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # Create appropriate datadirs on that machine.
  if not self.cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  # Replicate the config
  self.cfg.replicateConfigOnMachine(machine)

  # Reconfigure net on the target machine
  if not reconfigurenet_util.doReconfigureNet(
      self.cfg.globalParams, [machine], i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  # Start core services on the new node
  if not install_utilities.start_core(ver, home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1
  # Add the chunkserver back
  gfs_utils.AddGFSChunkservers(ver, testver, [machine])

  # Machine allocation comes first; it assigns things so that the
  # constraints are satisfied.
  if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1

  # Then try to reallocate some servers from existing machines to the new
  # machine.
  replaced = self.cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # Restart the babysitter before stopping the replaced services.
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Stop every service that was moved to the new machine.
  for server_string in replaced:
    server = serverlib.Server()
    server.InitFromName(server_string)
    replaced_type = server.servertype()
    kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
    if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" %
                    (replaced_type, server.port(), server.host()))

  # The new machine should be made active.
  if not install_utilities.set_install_state(
      machine, param('ENTERPRISE_HOME'), "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = (". %s && cd %s/local/google3/enterprise/legacy/scripts && "
               "./crawl_service.py %s" % (
                   param('ENTERPRISE_BASHRC'),
                   param('ENTERPRISE_HOME'),
                   param('ENTERPRISE_HOME')))
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  # save all the params
  self.cfg.saveParams()

  # for faster crawl recovery, restart all crawl processes
  self.restart_crawl_processes(serve_cmd)

  # Activate the crawl, logcontrol and serve services on the remote machine,
  # in that order; each entry is (init script command, failure message).
  activations = (
      ("/etc/rc.d/init.d/crawl_%s activate >&/dev/null</dev/null",
       "Could not activate crawl service on machine %s"),
      ("/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null</dev/null",
       "Could not activate logcontrol service on machine %s"),
      ("/etc/rc.d/init.d/serve_%s activate >&/dev/null</dev/null",
       "Could not activate serve service on machine %s"),
  )
  for cmd_template, failure_msg in activations:
    activate_cmd = cmd_template % param('VERSION')
    wrapped_cmd = SECURE_WRAPPER_COMMAND % (
        param('ENTERPRISE_HOME'), "-e", activate_cmd)
    if E.execute([machine], wrapped_cmd, None, 0) != E.ERR_OK:
      logging.error(failure_msg % machine)
      logging.error("Please activate by hand")
      return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEADDED % machine, "", true)
  return 0
def execute(self, argv):
  """Execute the service given the command line arguments.

  The first two arguments are 'ent_home' and 'task'; any further arguments
  are handed (together with argv[0]) to parse_args, which subclasses may
  override to parse them.

  Exits the process with the usage text when fewer than three arguments are
  given or the task is empty, when the service is not supposed to be up,
  when this node is disabled, or (after calling nop()) when a master-only
  task is requested on a non-master machine.

  Args:
    argv: full argument list: [program, ent_home, task, extra args...].
  """
  if len(argv) < 3:
    sys.exit(self.usage())

  # The first two arguments initialize the service and select the task.
  self.init_service(argv[1])
  self.task = argv[2].strip()

  # Everything after the task goes to the flag parser.
  self.parse_args([argv[0]] + list(argv[3:]))

  # Extra checks
  if not self.service_to_be_up():
    sys.exit('%s not active' % self.service_name)

  # Check that this node is enabled.
  if core_utils.AmIDisabled(self.version,
                            install_utilities.is_test(self.version)):
    logging.error('I am disabled.')
    sys.exit(-1)

  # Apart from activate/deactivate, master-only services do nothing on
  # other machines.
  if (self.performs_only_on_master and
      self.task not in ("activate", "deactivate") and
      not self.local_machine_is_master):
    logging.error('I am not the master')
    self.nop()
    sys.exit(0)

  if not self.task:
    sys.exit(self.usage())

  lock_suffix = (self.tmpdir, self.service_name, self.version)
  lockfile = "%s/%s_service_lock_%s" % lock_suffix
  pidfile = "%s/%s_service_pid_%s" % lock_suffix

  # activate / deactivate run unlocked; every other task runs behind a lock.
  if self.task in ["activate", "deactivate"]:
    do_task(self, self.task)
    return

  if not self.check_previous_cron_job:
    # Lock will time out after 60 rounds of 10 seconds
    E.exec_locked(lockfile, 60, do_task, (self, self.task,))
    return

  # Kill the previous cron job if the lockfile timestamp is too old; "stop"
  # and "restart" do not wait at all.
  valid_lock_duration = self.secs_to_kill_previous_job
  if self.task in ["stop", "restart"]:
    valid_lock_duration = 0
  E.exec_locked(lockfile, 1, do_task, (self, self.task,), {},
                valid_lock_duration, pidfile)