def save_config(self, filename):
    """writes a hotbackup rclone configuration file"""
    fhandle = self.install_prefix / filename
    lh.subsubsection("Writing RClone config:")
    print(self.get_config_json_sanitized())
    fhandle.write_text(self.get_config_json())
    return str(fhandle)
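# --- illustrative sketch (not part of the original class) ---------------------
# save_config() writes whatever get_config_json() returns straight to disk.
# A minimal, hypothetical round-trip check, assuming the config really is plain
# JSON, could look like this; `config_path` is just the return value of
# save_config(), and verify_config_file() is an invented helper name.
import json
from pathlib import Path


def verify_config_file(config_path):
    """Parse the written rclone config back in and fail early if it is not valid JSON."""
    text = Path(config_path).read_text()
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"rclone config at {config_path} is not valid JSON: {exc}") from exc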
def terminate_instance(self):
    """terminate the instance of this starter
    (it should kill all its managed services)"""
    lh.subsubsection("terminating instances for: " + str(self.name))
    logging.info("StarterManager: Terminating starter instance: %s", str(self.arguments))
    logging.info("This should terminate all child processes")
    self.instance.terminate()
    logging.info("StarterManager: waiting for process to exit")
    self.instance.wait()
    logging.info(
        "StarterManager: done - moving logfile from %s to %s",
        str(self.log_file),
        str(self.basedir / "arangodb.log.old"),
    )
    self.log_file.rename(self.basedir / "arangodb.log.old")
    for instance in self.all_instances:
        instance.rename_logfile()
        instance.detect_gone()
    # Clear instances as they have been stopped and the logfiles
    # have been moved.
    self.is_leader = False
    self.all_instances = []
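# --- illustrative sketch (not part of the original class) ---------------------
# terminate_instance() above relies on the starter forwarding the signal to its
# children and on wait() returning eventually. A hedged, generic pattern for
# "terminate, wait a bit, then kill" with only the stdlib looks like this;
# `proc` is assumed to be a subprocess.Popen handle.
import subprocess


def terminate_with_timeout(proc, timeout=60):
    """Ask the process to terminate; escalate to kill() if it does not exit in time."""
    proc.terminate()
    try:
        return proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        return proc.wait()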
def finish_setup_impl(self):
    # finish setup by starting the replication
    self.set_frontend_instances()

    self.checks["startReplJS"] = (
        "launching replication",
        """
print(
require("@arangodb/replication").setupReplicationGlobal({
    endpoint: "%s://127.0.0.1:%s",
    username: "******",
    password: "%s",
    verbose: false,
    includeSystem: true,
    incremental: true,
    autoResync: true
    }));
print("replication started")
process.exit(0);
"""
        % (
            self.get_protocol(),
            str(self.leader_starter_instance.get_frontend_port()),
            self.leader_starter_instance.get_passvoid(),
        ),
    )

    lh.subsubsection("prepare leader follower replication")
    arangosh_script = self.checks["beforeReplJS"]
    logging.info(str(self.leader_starter_instance.execute_frontend(arangosh_script)))

    lh.subsubsection("start leader follower replication")
    arangosh_script = self.checks["startReplJS"]
    retval = self.follower_starter_instance.execute_frontend(arangosh_script)
    if not retval:
        raise Exception(
            "Failed to start the replication using: %s %s"
            % (retval, str(self.checks["startReplJS"]))
        )
    logging.info("Replication started successfully")

    logging.info("save document")
    arangosh_script = self.checks["afterReplJS"]
    logging.info(str(self.leader_starter_instance.execute_frontend(arangosh_script)))
    self.makedata_instances.append(self.leader_starter_instance)
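# --- illustrative sketch (not part of the original class) ---------------------
# The startReplJS check interpolates the root password into the arangosh script,
# which is why the copy printed above has its credentials masked. A small helper,
# assuming the template uses ordinary %-placeholders, could keep the real script
# and a log-safe variant side by side; build_repl_script() is an invented name.
def build_repl_script(template, protocol, port, passvoid):
    """Return (script, sanitized_script); only the sanitized one should be logged."""
    script = template % (protocol, port, passvoid)
    sanitized = template % (protocol, port, "******")
    return script, sanitized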
def terminate_instance(self, keep_instances=False):
    """terminate the instance of this starter
    (it should kill all its managed services)"""
    lh.subsubsection("terminating instances for: " + str(self.name))
    logging.info("StarterManager: Terminating starter instance: %s", str(self.arguments))
    logging.info("This should terminate all child processes")
    self.instance.terminate()
    logging.info("StarterManager: waiting for process to exit")
    exit_code = self.instance.wait()
    self.add_logfile_to_report()
    # workaround BTS-815: starter exits 15 on the wintendo:
    if IS_WINDOWS and exit_code == 15:
        exit_code = 0
    if exit_code != 0:
        raise Exception("Starter %s exited with %d" % (self.basedir, exit_code))

    old_log = self.basedir / "arangodb.log.old"
    logging.info(
        "StarterManager: done - moving logfile from %s to %s",
        str(self.log_file),
        str(old_log),
    )
    if old_log.exists():
        old_log.unlink()
    self.log_file.rename(old_log)
    for instance in self.all_instances:
        instance.rename_logfile()
        if not instance.detect_gone(verbose=False):
            print("Manually terminating instance!")
            instance.terminate_instance(False)
    # Clear instances as they have been stopped and the logfiles
    # have been moved.
    if not keep_instances:
        self.is_leader = False
        self.all_instances = []
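# --- illustrative sketch (not part of the original class) ---------------------
# The rotation above deletes a stale arangodb.log.old before renaming, because
# Path.rename() refuses to overwrite an existing file on Windows. The same idea
# as a standalone helper, using only pathlib; rotate_logfile() is an invented name.
from pathlib import Path


def rotate_logfile(log_file: Path, suffix: str = ".old") -> Path:
    """Move log_file aside to <name><suffix>, replacing any previous rotated copy."""
    target = log_file.with_name(log_file.name + suffix)
    if target.exists():
        target.unlink()
    log_file.rename(target)
    return target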
def install(self, inst):
    """install the package to the system"""
    self.progress(True, "{0} - install package".format(str(self.name)))
    kill_all_processes(False)
    if self.do_install:
        lh.subsubsection("installing server package")
        inst.install_server_package()
        self.cfg.set_directories(inst.cfg)

        lh.subsubsection("checking files")
        inst.check_installed_files()

        lh.subsubsection("saving config")
        inst.save_config()

        lh.subsubsection("checking if service is up")
        if inst.check_service_up():
            lh.subsubsection("stopping service")
            inst.stop_service()
        inst.broadcast_bind()

        lh.subsubsection("outputting version")
        inst.output_arangod_version()
        inst.get_starter_version()
        inst.get_sync_version()

        lh.subsubsection("starting service")
        inst.start_service()

        inst.check_installed_paths()
        inst.check_engine_file()

        if not self.new_installer:
            # only install debug package for new package.
            self.progress(True, "installing debug package:")

        # start / stop
        if inst.check_service_up():
            inst.stop_service()
        inst.start_service()

    print(inst.cfg.semver)

    sys_arangosh = ArangoshExecutor(inst.cfg, inst.instance)

    logging.debug("self test after installation")
    if inst.cfg.have_system_service:
        sys_arangosh.self_test()

    if self.do_system_test:
        sys_arangosh.js_version_check()
        # TODO: here we should invoke Makedata for the system installation.
        logging.debug("stop system service to make ports available for starter")
        inst.stop_service()
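# --- illustrative sketch (not part of the original runner) --------------------
# install() repeatedly asks whether the system service is up before stopping or
# starting it. A hedged, generic readiness poll against ArangoDB's standard
# /_api/version HTTP endpoint could look like this; the URL, port and the
# "any HTTP answer counts as up" rule are assumptions for illustration only.
import time
import urllib.error
import urllib.request


def wait_for_arangod(url="http://127.0.0.1:8529/_api/version", timeout=60):
    """Poll until the server answers HTTP at all, or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5):
                return True
        except urllib.error.HTTPError:
            # the server answered (e.g. 401 without credentials) - good enough
            return True
        except (urllib.error.URLError, OSError):
            time.sleep(1)
    return False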
def jam_attempt_impl(self):
    # pylint: disable=too-many-statements
    # this is simply too slow to be worthwhile:
    # collections = self.get_collection_list()
    lh.subsubsection("wait for all shards to be in sync")
    retval = self.starter_instances[0].execute_frontend(self.check_collections_in_sync, True)
    if not retval:
        raise Exception(
            "Failed to ensure the cluster is in sync: %s %s"
            % (retval, str(self.check_collections_in_sync))
        )
    print("all in sync.")
    agency_leader = self.agency_get_leader()
    terminate_instance = 2
    survive_instance = 1
    if self.starter_instances[terminate_instance].have_this_instance(agency_leader):
        print("Cluster instance 2 has the agency leader; killing 1 instead")
        terminate_instance = 1
        survive_instance = 2
    logging.info("stopping instance %d" % terminate_instance)
    uuid = self.starter_instances[terminate_instance].get_dbservers()[0].get_uuid()
    self.starter_instances[terminate_instance].terminate_instance(keep_instances=True)
    logging.info("relaunching agent!")
    self.starter_instances[terminate_instance].manually_launch_instances(
        [InstanceType.AGENT], [], False, False
    )

    self.set_frontend_instances()

    prompt_user(self.basecfg, "instance stopped")
    if self.selenium:
        self.selenium.jam_step_1()

    ret = self.starter_instances[0].arangosh.check_test_data(
        "Cluster one node missing", True, ["--disabledDbserverUUID", uuid]
    )
    if not ret[0]:
        raise Exception("check data failed " + ret[1])

    ret = self.starter_instances[survive_instance].arangosh.check_test_data(
        "Cluster one node missing", True, ["--disabledDbserverUUID", uuid]
    )
    if not ret[0]:
        raise Exception("check data failed " + ret[1])

    # respawn instance, and get its state fixed
    self.starter_instances[terminate_instance].respawn_instance()
    self.set_frontend_instances()
    counter = 300
    while not self.starter_instances[terminate_instance].is_instance_up():
        if counter <= 0:
            raise Exception("Instance did not respawn in 5 minutes!")
        progress(".")
        time.sleep(1)
        counter -= 1
    print()
    self.starter_instances[terminate_instance].detect_instances()
    self.starter_instances[terminate_instance].detect_instance_pids()
    self.starter_instances[terminate_instance].detect_instance_pids_still_alive()
    self.set_frontend_instances()

    logging.info("jamming: Starting instance without jwt")
    moreopts = ["--starter.join", "127.0.0.1:9528"]
    if self.cfg.ssl and not self.cfg.use_auto_certs:
        keyfile = self.cert_dir / Path("nodeX") / "tls.keyfile"
        self.generate_keyfile(keyfile)
        moreopts.append(f"--ssl.keyfile={keyfile}")
    dead_instance = StarterManager(
        self.basecfg,
        Path("CLUSTER"),
        "nodeX",
        mode="cluster",
        jwt_str=None,
        expect_instances=[
            InstanceType.AGENT,
            InstanceType.COORDINATOR,
            InstanceType.DBSERVER,
        ],
        moreopts=moreopts,
    )
    dead_instance.run_starter(expect_to_fail=True)

    i = 0
    while True:
        logging.info(". %d", i)
        if not dead_instance.is_instance_running():
            dead_instance.check_that_starter_log_contains("Unauthorized. Wrong credentials.")
            break
        if i > 40:
            logging.info("Giving up waiting for the starter to exit")
            raise Exception("non-jwt-ed starter won't exit")
        i += 1
        time.sleep(10)
    logging.info(str(dead_instance.instance.wait(timeout=320)))
    logging.info("dead instance is dead?")

    prompt_user(self.basecfg, "cluster should be up")
    if self.selenium:
        self.selenium.jam_step_2()
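# --- illustrative sketch (not part of the original runner) --------------------
# jam_attempt_impl() hand-rolls two wait loops (the respawned instance coming up,
# the jwt-less starter dying). The same pattern as a reusable helper, with the
# predicate supplied by the caller, might look like this; wait_until() is an
# invented name, e.g. wait_until(lambda: starter.is_instance_up(), 300, 1, "instance up").
import time


def wait_until(predicate, timeout=300, interval=1.0, description="condition"):
    """Call predicate() until it returns truthy or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    raise TimeoutError(f"{description} not reached within {timeout}s")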
def downgrade_arangod_version_manual_impl(self):
    """manually downgrade this installation"""
    lh.subsubsection("wait for all shards to be in sync")
    retval = self.starter_instances[0].execute_frontend(self.check_collections_in_sync)
    if not retval:
        raise Exception(
            "Failed to ensure the cluster is in sync: %s %s"
            % (retval, str(self.check_collections_in_sync))
        )
    self.progress(True, "manual upgrade step 1 - stop instances")
    self.starter_instances[0].maintainance(False, InstanceType.COORDINATOR)
    for node in self.starter_instances:
        node.replace_binary_for_upgrade(self.new_cfg, False)
    for node in self.starter_instances:
        node.detect_instance_pids_still_alive()
    # fmt: off
    self.progress(True, "step 2 - upgrade agents")
    for node in self.starter_instances:
        node.upgrade_instances(
            [InstanceType.AGENT],
            [
                '--database.auto-upgrade', 'true',
                '--log.foreground-tty', 'true'
            ],
            # mitigate 3.6x agency shutdown issues:
            self.cfg.version >= arangoversions['370'])
    self.progress(True, "step 3 - upgrade db-servers")
    for node in self.starter_instances:
        node.upgrade_instances([InstanceType.DBSERVER], [
            '--database.auto-upgrade', 'true',
            '--log.foreground-tty', 'true'
        ])
    self.progress(True, "step 4 - coordinator upgrade")
    # now the new cluster is running. we will now run the coordinator upgrades
    for node in self.starter_instances:
        logging.info("upgrading coordinator instances\n" + str(node))
        node.upgrade_instances([InstanceType.COORDINATOR], [
            '--database.auto-upgrade', 'true',
            '--javascript.copy-installation', 'true',
            '--server.rest-server', 'false',
        ])
    # fmt: on
    self.progress(True, "step 5 - restart the full cluster")
    for node in self.starter_instances:
        node.respawn_instance()
    self.progress(True, "step 6 - wait for the cluster to be up")
    for node in self.starter_instances:
        node.detect_instances()
        node.wait_for_version_reply()
    # now the upgrade should be done.
    for node in self.starter_instances:
        node.detect_instances()
        node.wait_for_version_reply()
        node.probe_leader()
    self.set_frontend_instances()
    self.starter_instances[0].maintainance(False, InstanceType.COORDINATOR)
    if self.selenium:
        self.selenium.test_wait_for_upgrade()  # * 5s
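# --- illustrative sketch (not part of the original runner) --------------------
# Steps 2-4 above pass nearly identical '--database.auto-upgrade' argument lists
# per instance type. A hedged helper that centralizes those option lists (the
# flag names are taken from the calls above; the mapping itself is illustrative):
UPGRADE_OPTS = {
    "agent": ['--database.auto-upgrade', 'true', '--log.foreground-tty', 'true'],
    "dbserver": ['--database.auto-upgrade', 'true', '--log.foreground-tty', 'true'],
    "coordinator": [
        '--database.auto-upgrade', 'true',
        '--javascript.copy-installation', 'true',
        '--server.rest-server', 'false',
    ],
}


def upgrade_opts_for(role):
    """Return a copy of the auto-upgrade options for the given instance role."""
    return list(UPGRADE_OPTS[role])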