def SetInitState(cfg, state):
  """Sets system's initialization state.

  For oneway, it stores it in C.ENT_SYSTEM_INIT_STATE. For Clusters,
  it stores it in chubby file /ls/ent<version>/ENT_SYSTEM_INIT_STATE.

  @param cfg - of type configurator.
  @param state - string
  """
  # oneway?
  if 1 == len(core_utils.GetNodes()):
    cfg.setGlobalParam(C.ENT_SYSTEM_INIT_STATE, state)
    return
  # Cluster: write the state into a temp file, then copy it into chubby.
  tmpfile = E.mktemp('/export/hda3/tmp')
  try:
    f = open(tmpfile, 'w')
    try:
      f.write(state)
    finally:
      # Close unconditionally so the handle is not leaked when write()
      # raises (the original only closed on the success path).
      f.close()
  except IOError:
    logging.fatal('Cannot write to temp file %s' % tmpfile)
    return
  version = cfg.getGlobalParam('VERSION')
  lockserv_cmd_prefix = core_utils.GetLSClientCmd(version, is_test(version))
  chubby_root_dir = '/ls/%s' % core_utils.GetCellName(version)
  write_cmd = '%s cp %s %s/%s' % (lockserv_cmd_prefix, tmpfile,
                                  chubby_root_dir, 'ENT_SYSTEM_INIT_STATE')
  logging.info('setting system init state to: %s', state)
  E.exe_or_fail(write_cmd)
  E.exe('rm -rf %s' % tmpfile)
def remove(self, machine):
  """Removes a machine from the configuration.

  @param machine - hostname of the node to remove.
  @return 0 / halt() result on success, 1 on failure.
  """
  if machine not in self.cfg.getGlobalParam('MACHINES'):
    logging.error("%s doesn't exist" % machine)
    return 1

  # Refuse to remove ourselves *before* touching any services.  The
  # original ran stop_core first, so an attempt to remove self left this
  # node's own core services stopped even though the remove was aborted.
  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)
  # if possible stop the core services, ignore return code
  install_utilities.stop_core(ver, home, [machine])

  # Halt the machine if APC is used.
  error = self.halt(machine)

  self.cfg.globalParams.ReplaceVarInParam("SERVERS", None, machine)
  self.cfg.globalParams.ReplaceVarInParam("MACHINES", None, machine)
  ret = core_utils.AddDeadNode(ver, testver, machine)
  if ret:
    logging.error('Cannot add dead node to the lockserver.')
    # we ignore this error for now
  # remove the chunkserver running on the node
  gfs_utils.DeleteGFSChunkservers(ver, testver, [machine])

  # now we need to remove the data disks that were on this machine
  data_disks = self.cfg.globalParams.var_copy('DATACHUNKDISKS')
  if data_disks.has_key(machine):
    del data_disks[machine]
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', data_disks):
      return 1

  # This also saves the config file
  if not self.cfg.DoMachineAllocation():
    return 1

  # Now we need to restart babysitter because the old one
  # is out of sync after this
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (
          self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME'),
          self.cfg.getGlobalParam('ENTERPRISE_HOME')))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))

  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false,
                  M.MSG_MACHINEREMOVED % machine, "", true)

  return error
def remove(self, machine):
  """Removes a machine from the configuration."""
  known_machines = self.cfg.getGlobalParam('MACHINES')
  if machine not in known_machines:
    logging.error("%s doesn't exist" % machine)
    return 1

  version = self.cfg.getGlobalParam('VERSION')
  ent_home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  is_testver = install_utilities.is_test(version)
  # Best effort: stop the core services on the node (return code ignored).
  install_utilities.stop_core(version, ent_home, [machine])

  if machine == E.getCrtHostName():
    logging.error("Cannot remove self")
    return 1

  # Power the node down through APC, when one is configured.
  halt_status = self.halt(machine)

  params = self.cfg.globalParams
  params.ReplaceVarInParam("SERVERS", None, machine)
  params.ReplaceVarInParam("MACHINES", None, machine)
  dead_node_err = core_utils.AddDeadNode(version, is_testver, machine)
  # Drop the chunkserver that was running on the node.
  gfs_utils.DeleteGFSChunkservers(version, is_testver, [machine])
  if dead_node_err:
    # Tolerated for now -- log and keep going.
    logging.error('Cannot add dead node to the lockserver.')

  # Forget the data disks that lived on this machine.
  chunk_disks = params.var_copy('DATACHUNKDISKS')
  if chunk_disks.has_key(machine):
    del chunk_disks[machine]
    if not self.cfg.setGlobalParam('DATACHUNKDISKS', chunk_disks):
      return 1

  # DoMachineAllocation also saves the config file.
  if not self.cfg.DoMachineAllocation():
    return 1

  # The running babysitter is now out of sync with the new config;
  # rerun serve_service to restart it.
  bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')
  home_dir = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s" % (bashrc, home_dir, home_dir))
  E.exe("%s %s" % (serve_service_cmd, "babysit"))
  self.restart_crawl_processes(serve_service_cmd)

  if not mail_already_sent(M.MSG_MACHINEREMOVED % machine):
    SendMail.send(self.cfg, None, false, M.MSG_MACHINEREMOVED % machine,
                  "", true)
  return halt_status
def start_service(self, service):
  """Activates and then starts `service` for self.version_ on self.machine_.

  Returns the exit status of the last remote command (0 on success).
  """
  logging.info("ACTIVATE: %s service %s on %s" %
               (service, self.version_, self.machine_))
  init_script = "/etc/rc.d/init.d/%s_%s" % (service, self.version_)
  activate_cmd = E.nonblocking_cmd("%s activate" % init_script)
  status = E.exe("ssh %s %s" % (self.machine_, commands.mkarg(activate_cmd)))
  # Only attempt the start when activation succeeded.
  if status == 0:
    logging.info("START: %s service %s on %s" %
                 (service, self.version_, self.machine_))
    start_cmd = E.nonblocking_cmd("%s start" % init_script)
    status = E.exe("ssh %s %s" % (self.machine_, commands.mkarg(start_cmd)))
  return status
def start_service(self, service):
  """Activates and starts the given service on the given machine."""
  def _remote_initd(action):
    # Build the init.d invocation and run it on the remote host via ssh.
    cmd = E.nonblocking_cmd("/etc/rc.d/init.d/%s_%s %s" %
                            (service, self.version_, action))
    return E.exe("ssh %s %s" % (self.machine_, commands.mkarg(cmd)))

  logging.info("ACTIVATE: %s service %s on %s" %
               (service, self.version_, self.machine_))
  ret = _remote_initd("activate")
  if ret == 0:
    logging.info("START: %s service %s on %s" %
                 (service, self.version_, self.machine_))
    ret = _remote_initd("start")
  return ret
def EnsureDirectory(self, fileutil_args, path):
  """Creates `path` (and parents) if needed, locally or on GFS.

  @param fileutil_args - extra flags passed to fileutil for GFS paths.
  @param path - directory to create; a path starting with /gfs/ is a
      GFS directory and is created through fileutil.
  """
  logging.info("ensuring directory %s" % path)
  # str.startswith is the idiomatic form of string.find(...) == 0.
  if path.startswith("/gfs/"):
    # GFS path: create through fileutil (-f forces / ignores existing).
    cmnd = "fileutil %s -f mkdir -p %s" % (fileutil_args, path)
  else:
    cmnd = "mkdir -p %s" % path
  res = E.exe(cmnd)
  logging.info("Result of command %s is %d" % (cmnd, res))
def restart_babysitter(self):
  'Returns bool - true on success and false on failure'
  bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  serve_service_cmd = (
      ". %s && "
      "cd %s/local/google3/enterprise/legacy/scripts && "
      "./serve_service.py %s " % (bashrc, home, home))
  # Launch the babysitter in the background; success means exit status 0.
  return E.exe("%s %s &" % (serve_service_cmd, "babysit")) == 0
def delete_pagerank_barriers(self):
  """ Deletes barrier files used by pr_main """
  # Sanity check to see we are indeed running pr_main
  if self.cp.var('PAGERANKER_PROG') != 'pr_main':
    logging.fatal('Not using pr_main anymore')

  # Pull the parameters we need out of entconfig.
  barrier_prefix = '%s/barriers' % self.cp.var('NAMESPACE_PREFIX')
  data_dir = self.cp.var('DATADIR')
  aliases = self.cp.var('GFS_ALIASES')

  # Nuke'em. When there is only a single pr_main running (shards=1), it
  # does this during its startup.
  cmd = ('%s/fileutil --datadir=%s --gfs_aliases=%s '
         '--bnsresolver_use_svelte=false '
         ' rm -f %s.barrier_progpr_*_of_*_op*_iter*' %
         (self.bin_dir, data_dir, aliases, barrier_prefix))
  logging.info('Deleting barriers - %s' % cmd)
  E.exe(cmd)
def createDataDirs(self, machines, onlyifneeded=false, node_replacement=0):
  """ Create the directories for an index.

  Note that this function will get executed when a node is added
  back to the cluster.

  Input: onlyifneeded: If true, createDataDirs will only proceed if
         necessary; if search.config is missing, we assume
         enterprise-data needs to be re-created
         node_replacement: set when called while replacing a node.

  @return boolean Success status
  """
  if onlyifneeded:
    # The presence or absence of search.config indicates if we need to
    # re-create the directories
    config = '%s/search.config' % self.getGlobalParam(C.DATADIR)
    if E.access(machines, config, 'f'):
      logging.info('createDataDirs: search.config already exists')
      return true
    else:
      logging.info("createDataDirs: search.config doesn't exist; re-creating")

  logging.info("Create enterprise datadir...")
  if not data_directory.create(
      self.getGlobalParam(C.DATADISK),
      self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
      self.getGlobalParam(C.DATACHUNK_PREFIX),
      "enterprise",
      self.getGlobalParam(C.BIN_DIRS),
      machines):
    logging.error("Error creating datadir")
    return false

  logging.info("Create querycache datadir...")
  if not data_directory.create(
      "%s/../querycache" % self.getGlobalParam(C.DATADISK),
      self.getGlobalParam("ENT_BIGFILE_DATADISKS"),
      "%s/../querycache" % self.getGlobalParam(C.DATACHUNK_PREFIX),
      "cache",
      self.getGlobalParam(C.BIN_DIRS),
      machines):
    logging.error("Error creating datadir")
    return false

  # On a one-way (no GFS cell) the feed and onebox log directories live
  # on local disk and must be created here.  The original tested
  # GFS_CELL separately for each mkdir step; one guard covers both.
  if not self.getGlobalParam(C.GFS_CELL):
    # Create FEEDS_DIR and FEED_STATUS_DIR for one-way
    cmnd = "mkdir -p %s; mkdir -p %s" % (
        self.getGlobalParam('FEEDS_DIR'),
        self.getGlobalParam('FEED_STATUS_DIR'))
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))
    # Create ONEBOX_LOGS_DIR for one-way
    cmnd = "mkdir -p %s" % self.getGlobalParam('ONEBOX_LOGS_DIR')
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))

  # Create directory for rt-index cache
  if self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR'):
    d = self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR')
    out = []
    cmd = "mkdir -p %s; test -d %s" % (d, d)
    if E.ERR_OK != E.execute(machines, cmd, out, false):
      logging.error("Error creating cache directory for rtslave: %s" % out)
      return false
  # Ram cache directory is the mount point itself, so we don't need to
  # create it.
  #self.getGlobalParam('RTSLAVE_RAM_DIR_FOR_INDEX_CACHING')
  return true
def createDataDirs(self, machines, onlyifneeded=false, node_replacement=0):
  """ Create the directories for an index.

  Note that this function will get executed when a node is added
  back to the cluster.

  Input: onlyifneeded: If true, createDataDirs will only proceed if
         necessary; if search.config is missing, we assume
         enterprise-data needs to be re-created

  @return boolean Success status
  """
  if onlyifneeded:
    # search.config acts as the marker: when it is present the data
    # directories were already built and nothing needs to happen.
    marker = '%s/search.config' % self.getGlobalParam(C.DATADIR)
    if E.access(machines, marker, 'f'):
      logging.info('createDataDirs: search.config already exists')
      return true
    else:
      logging.info("createDataDirs: search.config doesn't exist; re-creating")

  bigfile_disks = self.getGlobalParam("ENT_BIGFILE_DATADISKS")
  bin_dirs = self.getGlobalParam(C.BIN_DIRS)

  logging.info("Create enterprise datadir...")
  ok = data_directory.create(self.getGlobalParam(C.DATADISK),
                             bigfile_disks,
                             self.getGlobalParam(C.DATACHUNK_PREFIX),
                             "enterprise", bin_dirs, machines)
  if not ok:
    logging.error("Error creating datadir")
    return false

  logging.info("Create querycache datadir...")
  ok = data_directory.create(
      "%s/../querycache" % self.getGlobalParam(C.DATADISK),
      bigfile_disks,
      "%s/../querycache" % self.getGlobalParam(C.DATACHUNK_PREFIX),
      "cache", bin_dirs, machines)
  if not ok:
    logging.error("Error creating datadir")
    return false

  # Create FEEDS_DIR and FEED_STATUS_DIR for one-way
  if not self.getGlobalParam(C.GFS_CELL):
    cmnd = "mkdir -p %s; mkdir -p %s" % (
        self.getGlobalParam('FEEDS_DIR'),
        self.getGlobalParam('FEED_STATUS_DIR'))
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))

  # Create ONEBOX_LOGS_DIR for one-way
  if not self.getGlobalParam(C.GFS_CELL):
    cmnd = "mkdir -p %s" % self.getGlobalParam('ONEBOX_LOGS_DIR')
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))

  # Create directory for rt-index cache
  cache_dir = self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR')
  if cache_dir:
    d = self.getGlobalParam('RTSLAVE_LOCAL_CACHE_DIR')
    out = []
    cmd = "mkdir -p %s; test -d %s" % (d, d)
    if E.ERR_OK != E.execute(machines, cmd, out, false):
      logging.error("Error creating cache directory for rtslave: %s" % out)
      return false
  # Ram cache directory is the mount point itself, so we don't need to
  # create it.
  #self.getGlobalParam('RTSLAVE_RAM_DIR_FOR_INDEX_CACHING')
  return true
def restart_crawl_processes(self, serve_service_cmd):
  """Restarts a few crawl-related servers so the bringup is quick."""
  components = ("--components=pr_main,urlmanager,"
                "urlserver,bot,contentfilter")
  E.exe("%s %s %s" % (serve_service_cmd, "start", components))
def add(self, machine, apc_outlet):
  """ This adds a machine to the configuration

  @param machine - hostname of the node to add.
  @param apc_outlet - APC outlet the node is connected to.
  @return 0 on success, 1 on any failure.
  """
  # We can add a machine only when we are in active state
  if install_utilities.install_state(
      self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # First test for accessibility of the machine.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # start the svs on the remote machine
  restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      machine)
  if E.execute([E.getCrtHostName()],
               SECURE_WRAPPER_COMMAND % (
                   self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                   "-p2",
                   restart_svs_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # wait for some time for svs to come up
  time.sleep(5)

  # check to see if the svs is up and is the right version
  if not svs_utilities.PingAndCheckSvsVersion(
      self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  ver = self.cfg.getGlobalParam('VERSION')
  home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  testver = install_utilities.is_test(ver)

  # update MACHINES
  machines = self.cfg.getGlobalParam('MACHINES')
  if machine not in machines:
    machines.append(machine)
  self.cfg.setGlobalParam('MACHINES', machines)

  ret = core_utils.RemDeadNode(ver, testver, machine)
  if ret:
    logging.error('Cannot remove dead node from lockserver.')
    # we ignore this error for now

  # We just added a new machine into the config
  # this will lead to a change in concentrator config
  # so we need to re-run serve service which will
  # write the new config and restart the concentrator
  serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
              "./serve_service.py %s" % (
                  self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                  self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                  self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll the machine-parameter cache for the node's disk facts; SVS may
  # take a while before it reports them.
  num_tries = 5
  cur_try = 0
  while cur_try < num_tries:
    cur_try = cur_try + 1
    all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    time.sleep(60)

  # Fixed: compare to None with `is`, not `==`.
  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  bad_disks = string.split(bad_disks, ' ')
  all_disks = string.split(all_disks, ' ')
  good_disks = filter(lambda x, y=bad_disks: x not in y, all_disks)
  good_disks = map(lambda x: "%s3" % x, good_disks)
  # change sda3 to hda3 etc.
  good_disks = map(lambda x: re.sub(r'^s', 'h', x), good_disks)
  # Remove duplicates while preserving order.  (Was a side-effecting
  # list comprehension; an explicit loop is the idiomatic form.)
  unique_good_disks = []
  for disk in good_disks:
    if disk not in unique_good_disks:
      unique_good_disks.append(disk)

  # Add disks
  self.updatedisk(machine, unique_good_disks, true)

  # apc map update
  apc_map = self.cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not self.cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # create appropriate datadirs on that machine
  if not self.cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  # Replicate the config
  self.cfg.replicateConfigOnMachine(machine)

  # Reconfigure net on the target machine
  if not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams,
                                              [machine], i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  # Start core services on the new node
  if not install_utilities.start_core(ver, home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1

  # Add the chunkserver back
  gfs_utils.AddGFSChunkservers(ver, testver, [machine])

  # first we need to do Machine allocation.
  # this will assign things that will satisfy the constraints
  if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1

  # now try to relllocate some servers from existing machines to the new machine
  replaced = self.cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # first we need to restart the babysitter
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Now we need to stop all the replaced services
  for server_string in replaced:
    server = serverlib.Server()
    server.InitFromName(server_string)
    replaced_type = server.servertype()
    kill_cmd = servertype.GetKillCmd(replaced_type, server.port())
    if E.execute([server.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" %
                    (replaced_type, server.port(), server.host()))

  # we should make it active
  if not install_utilities.set_install_state(
      machine, self.cfg.getGlobalParam('ENTERPRISE_HOME'), "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
              "./crawl_service.py %s" % (
                  self.cfg.getGlobalParam('ENTERPRISE_BASHRC'),
                  self.cfg.getGlobalParam('ENTERPRISE_HOME'),
                  self.cfg.getGlobalParam('ENTERPRISE_HOME'))
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  # save all the params
  self.cfg.saveParams()

  # for faster crawl recovery, lets restart all crawl processes
  self.restart_crawl_processes(serve_cmd)

  # activate the crawl and logcontrol service on the remote machine
  crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                       "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine], SECURE_WRAPPER_COMMAND % (
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      "-e",
      crawl_activate_cmd), None, 0) != E.ERR_OK:
    logging.error("Could not activate crawl service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                     "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine], SECURE_WRAPPER_COMMAND % (
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      "-e",
      log_activate_cmd), None, 0) != E.ERR_OK:
    logging.error("Could not activate logcontrol service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                       "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine], SECURE_WRAPPER_COMMAND % (
      self.cfg.getGlobalParam('ENTERPRISE_HOME'),
      "-e",
      serve_activate_cmd), None, 0) != E.ERR_OK:
    logging.error("Could not activate serve service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(self.cfg, None, false, M.MSG_MACHINEADDED % machine,
                  "", true)

  return 0
def add(self, machine, apc_outlet):
  """ This adds a machine to the configuration """
  ent_home = self.cfg.getGlobalParam('ENTERPRISE_HOME')
  bashrc = self.cfg.getGlobalParam('ENTERPRISE_BASHRC')

  # Machines may only be added while the system is ACTIVE.
  if install_utilities.install_state(
      self.cfg.getGlobalParam('VERSION')) != "ACTIVE":
    logging.error("Can add a machine only when we are in active state")
    return 1

  # The node must be reachable over ssh before anything else.
  if E.execute([machine], 'echo 1', None, 1) != E.ERR_OK:
    logging.error("Could not ssh into the machine %s" % machine)
    return 1

  # Bring up SVS on the new node through the secure wrapper.
  restart_svs_cmd = "%s/local/google3/enterprise/legacy/util/svs_utilities.py %s %s" % (
      ent_home, ent_home, machine)
  if E.execute([E.getCrtHostName()],
               SECURE_WRAPPER_COMMAND % (ent_home, "-p2", restart_svs_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not start svs on machine %s" % machine)
    return 1

  # Give SVS a moment to come up, then verify it answers with the
  # expected version.
  time.sleep(5)
  if not svs_utilities.PingAndCheckSvsVersion(bashrc, ent_home, machine):
    logging.error("Svs not running correctly on machine %s" % machine)
    return 1

  version = self.cfg.getGlobalParam('VERSION')
  is_testver = install_utilities.is_test(version)

  # Record the node in MACHINES.
  machine_list = self.cfg.getGlobalParam('MACHINES')
  if machine not in machine_list:
    machine_list.append(machine)
  self.cfg.setGlobalParam('MACHINES', machine_list)

  if core_utils.RemDeadNode(version, is_testver, machine):
    # Tolerated for now -- log and keep going.
    logging.error('Cannot remove dead node from lockserver.')

  # The new machine changes the concentrator config, so rerun
  # serve_service to write the new config and restart the concentrator.
  serve_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
              "./serve_service.py %s" % (bashrc, ent_home, ent_home)
  E.exe("%s %s" % (serve_cmd, "babysit"))

  # Poll for the node's disk facts; SVS may need time to publish them.
  for _ in range(5):
    all_disks = self.cfg.mach_param_cache.GetFact("mounted-drives", machine)
    bad_disks = self.cfg.mach_param_cache.GetFact("var_log_badhds", machine)
    if bad_disks and all_disks:
      break
    time.sleep(60)

  if all_disks is None or bad_disks is None:
    logging.error("Could not get machine information about %s" % machine)
    return 1

  bad_list = string.split(bad_disks, ' ')
  mounted = string.split(all_disks, ' ')
  healthy = [d for d in mounted if d not in bad_list]
  # Partition 3 holds the data; also rewrite sda3 -> hda3 etc.
  healthy = ["%s3" % d for d in healthy]
  healthy = [re.sub(r'^s', 'h', d) for d in healthy]
  # De-duplicate, keeping first occurrence order.
  deduped = []
  for disk in healthy:
    if disk not in deduped:
      deduped.append(disk)

  self.updatedisk(machine, deduped, true)

  # Register the node's APC outlet.
  apc_map = self.cfg.globalParams.var_copy('APC_MAP')
  apc_map[machine] = apc_util.PortMap(apc_outlet)
  if not self.cfg.setGlobalParam('APC_MAP', apc_map):
    logging.error("ERROR setting apc map to %s" % repr(apc_map))
    return 1

  # Build the data directories on the node.
  if not self.cfg.createDataDirs([machine], node_replacement=1):
    logging.error("ERROR could not create datadirs on machine %s" % machine)
    return 1

  self.cfg.replicateConfigOnMachine(machine)

  if not reconfigurenet_util.doReconfigureNet(self.cfg.globalParams,
                                              [machine], i_am_master=0):
    logging.error('reconfigurenet failed for %s' % machine)
    return 1

  if not install_utilities.start_core(version, ent_home, [machine], ignore=0):
    logging.error("ERROR could not start core services on %s" % machine)
    return 1

  # Re-register the node's GFS chunkserver.
  gfs_utils.AddGFSChunkservers(version, is_testver, [machine])

  # Machine allocation satisfies the serving constraints; then move some
  # servers from existing machines onto the new one.
  if not self.cfg.DoMachineAllocation(serversets=['workqueue-slave']):
    logging.error("ERROR doing machine allocation")
    return 1
  replaced = self.cfg.AllocateServersToNewMachine(machine)
  if not replaced:
    logging.error("ERROR allocating services to the new machine")
    return 1

  # Restart the babysitter and give it a minute to settle.
  E.exe("%s %s" % (serve_cmd, "babysit"))
  time.sleep(60)

  # Kill every server instance that was moved off its old machine.
  for server_string in replaced:
    srv = serverlib.Server()
    srv.InitFromName(server_string)
    srv_type = srv.servertype()
    kill_cmd = servertype.GetKillCmd(srv_type, srv.port())
    if E.execute([srv.host()], kill_cmd, None, 1) != E.ERR_OK:
      logging.error("ERROR killing %s running on port %d on %s" %
                    (srv_type, srv.port(), srv.host()))

  # Flip the node itself to ACTIVE.
  if not install_utilities.set_install_state(machine, ent_home, "ACTIVE"):
    logging.error("ERROR changing state on machine %s. "
                  "Please make it active and activate and "
                  "start crawl service on it" % machine)
    return 1

  crawl_cmd = ". %s && cd %s/local/google3/enterprise/legacy/scripts && " \
              "./crawl_service.py %s" % (bashrc, ent_home, ent_home)
  if E.execute([machine], "%s %s" % (crawl_cmd, "start"), None, 1) != E.ERR_OK:
    logging.error("Could not start crawl service on %s" % machine)
    return 1

  self.cfg.saveParams()

  # For faster crawl recovery restart all crawl processes.
  self.restart_crawl_processes(serve_cmd)

  # Activate crawl, logcontrol and serve services on the remote machine.
  crawl_activate_cmd = "/etc/rc.d/init.d/crawl_%s activate >&/dev/null" \
                       "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine],
               SECURE_WRAPPER_COMMAND % (ent_home, "-e", crawl_activate_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not activate crawl service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  log_activate_cmd = "/etc/rc.d/init.d/logcontrol_%s activate >&/dev/null" \
                     "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine],
               SECURE_WRAPPER_COMMAND % (ent_home, "-e", log_activate_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not activate logcontrol service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  serve_activate_cmd = "/etc/rc.d/init.d/serve_%s activate >&/dev/null" \
                       "</dev/null" % self.cfg.getGlobalParam('VERSION')
  if E.execute([machine],
               SECURE_WRAPPER_COMMAND % (ent_home, "-e", serve_activate_cmd),
               None, 0) != E.ERR_OK:
    logging.error("Could not activate serve service on machine %s" % machine)
    logging.error("Please activate by hand")
    return 1

  logging.info("Machine %s successfully added into the system" % machine)

  if not mail_already_sent(M.MSG_MACHINEADDED % machine):
    SendMail.send(self.cfg, None, false, M.MSG_MACHINEADDED % machine,
                  "", true)
  return 0
def EnsureSpellingData(self, reset=0):
  """ This ensures that initial spelling data is present.

  If reset is set we clear ENT_SPELL_SERVING_ID and revert files to
  initial state.
  """
  logging.info("ensuring presence of initial spelling data")
  serving_id_cfg_name = 'ENT_SPELL_SERVING_ID'
  # if reset is set - blow away runtime dictionary version. (this is
  # useful after index has been reset).
  if self.hasGlobalParam(serving_id_cfg_name) and (reset == 1):
    self.setGlobalParam(serving_id_cfg_name, 0)
  if (self.hasGlobalParam(serving_id_cfg_name)) and \
     (self.getGlobalParam(serving_id_cfg_name) == 0):
    fileutil_args = ""
    if self.hasGlobalParam('GFS_ALIASES'):
      fileutil_args = "--gfs_aliases='%s'" % \
                      self.getGlobalParam('GFS_ALIASES')
    fileutil_args += " --bnsresolver_use_svelte=false"
    if self.hasGlobalParam('DATADIR'):
      fileutil_args = "%s --datadir=%s" % \
                      (fileutil_args, self.getGlobalParam('DATADIR'))
    # note: assumes that the parent of spell_root exists
    spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
    if spell_root.endswith('/'):
      spell_root = spell_root[:-1]
    # Build the target from the normalized root.  The original used the
    # raw config value here, which could produce a doubled slash
    # ("...//spell-0") when ENT_SPELL_ROOT_DIR ends with '/'.
    target_path = "%s/spell-0" % spell_root
    self.EnsureDirectory(fileutil_args, spell_root)
    self.EnsureDirectory(fileutil_args, target_path)
    logging.info("ensuring files")
    if not self.hasGlobalParam('ENTERPRISE_HOME'):
      logging.fatal("No ENTERPRISE_HOME config parameter")
      return
    src_path = "%s/../spelling-data/runtime" % \
               self.getGlobalParam('ENTERPRISE_HOME')
    # Copy every initial spelling file into the serving directory.
    cmnd = "(cd %s ; " % src_path
    cmnd = cmnd + "for f in *.spelling.* ; "
    cmnd = cmnd + "do fileutil %s -f cp %s/$f %s/$f; done)" % \
           (fileutil_args, src_path, target_path)
    res = E.exe(cmnd)
    logging.info("Result of command %s is %d" % (cmnd, res))
    # ensure spelling data is present by comparing file counts
    num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
    logging.info("There are %d spelling files in the source directory" % \
                 num_src_files)
    num_target_files = \
        self.CountSpellingFiles(fileutil_args, target_path)
    logging.info("There are %d spelling files in the target directory" % \
                 num_target_files)
    if num_src_files == num_target_files:
      logging.info("spelling data present")
    else:
      logging.fatal("failed to ensure presence of spelling data")
      return
  else:
    logging.info("no config param %s, or it's not 0" % serving_id_cfg_name)
    logging.info("skipping spelling data check")
def EnsureSpellingData(self, reset=0):
  """ This ensures that initial spelling data is present.

  If reset is set we clear ENT_SPELL_SERVING_ID and revert files to
  initial state.
  """
  logging.info("ensuring presence of initial spelling data")
  serving_id_param = 'ENT_SPELL_SERVING_ID'
  # When resetting, blow away the runtime dictionary version (useful
  # after the index has been reset).
  if self.hasGlobalParam(serving_id_param) and (reset == 1):
    self.setGlobalParam(serving_id_param, 0)

  if not (self.hasGlobalParam(serving_id_param) and
          self.getGlobalParam(serving_id_param) == 0):
    logging.info("no config param %s, or it's not 0" % serving_id_param)
    logging.info("skipping spelling data check")
    return

  # Assemble the fileutil flags from the available config params.
  fileutil_args = ""
  if self.hasGlobalParam('GFS_ALIASES'):
    fileutil_args = "--gfs_aliases='%s'" % self.getGlobalParam('GFS_ALIASES')
  fileutil_args += " --bnsresolver_use_svelte=false"
  if self.hasGlobalParam('DATADIR'):
    fileutil_args = "%s --datadir=%s" % (fileutil_args,
                                         self.getGlobalParam('DATADIR'))

  # note: assumes that the parent of spell_root exists
  spell_root = self.getGlobalParam('ENT_SPELL_ROOT_DIR')
  last = len(spell_root) - 1
  if spell_root[last] == '/':
    spell_root = spell_root[0:last]
  target_path = "%s/spell-0" % self.getGlobalParam('ENT_SPELL_ROOT_DIR')
  self.EnsureDirectory(fileutil_args, spell_root)
  self.EnsureDirectory(fileutil_args, "%s" % spell_root)
  self.EnsureDirectory(fileutil_args, "%s" % target_path)

  logging.info("ensuring files")
  if not self.hasGlobalParam('ENTERPRISE_HOME'):
    logging.fatal("No ENTERPRISE_HOME config parameter")
    return
  src_path = "%s/../spelling-data/runtime" % \
             self.getGlobalParam('ENTERPRISE_HOME')
  # Copy each initial spelling file into the serving directory.
  cmnd = "(cd %s ; " % src_path
  cmnd = cmnd + "for f in *.spelling.* ; "
  cmnd = cmnd + "do fileutil %s -f cp %s/$f %s/$f; done)" % \
         (fileutil_args, src_path, target_path)
  res = E.exe(cmnd)
  logging.info("Result of command %s is %d" % (cmnd, res))

  # Compare source and target file counts to verify the copy.
  num_src_files = self.CountSpellingFiles(fileutil_args, src_path)
  logging.info("There are %d spelling files in the source directory" % \
               num_src_files)
  num_target_files = self.CountSpellingFiles(fileutil_args, target_path)
  logging.info("There are %d spelling files in the target directory" % \
               num_target_files)
  if num_src_files == num_target_files:
    logging.info("spelling data present")
  else:
    logging.fatal("failed to ensure presence of spelling data")
    return