Example #1
 def save_config(self, filename):
     """writes a hotbackup rclone configuration file"""
     fhandle = self.install_prefix / filename
     lh.subsubsection("Writing RClone config:")
     print(self.get_config_json_sanitized())
     fhandle.write_text(self.get_config_json())
     return str(fhandle)
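The method above joins the install prefix and the filename with pathlib and writes the serialized configuration in a single write_text() call. A minimal standalone sketch of the same pattern, assuming a hypothetical prefix directory, filename and JSON payload:

import json
import tempfile
from pathlib import Path

def save_config(install_prefix: Path, filename: str, config: dict) -> str:
    """Write a configuration file below install_prefix and return its path."""
    target = install_prefix / filename                   # pathlib joins the segments
    target.write_text(json.dumps(config, indent=2))      # create or overwrite the file
    return str(target)

# Hypothetical usage: the prefix and payload are made up for this sketch.
prefix = Path(tempfile.mkdtemp())
print(save_config(prefix, "rclone.json", {"remote": "hotbackup"}))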
Example #2
    def terminate_instance(self):
        """ terminate the instance of this starter
            (it should kill all its managed services)"""

        lh.subsubsection("terminating instances for: " + str(self.name))
        logging.info("StarterManager: Terminating starter instance: %s",
                     str(self.arguments))

        logging.info("This should terminate all child processes")
        self.instance.terminate()
        logging.info("StarterManager: waiting for process to exit")
        self.instance.wait()

        logging.info("StarterManager: done - moving logfile from %s to %s",
                     str(self.log_file),
                     str(self.basedir / "arangodb.log.old"))
        self.log_file.rename(self.basedir / "arangodb.log.old")

        for instance in self.all_instances:
            instance.rename_logfile()
            instance.detect_gone()
        # Clear instances as they have been stopped and the logfiles
        # have been moved.
        self.is_leader = False
        self.all_instances = []
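The shutdown sequence above is plain subprocess plus pathlib: terminate the child, wait for it, then rename the log out of the way. A minimal sketch of that pattern with a throwaway child process and a temporary directory (all names here are stand-ins, not the framework's objects):

import subprocess
import sys
import tempfile
from pathlib import Path

basedir = Path(tempfile.mkdtemp())
log_file = basedir / "arangodb.log"
log_file.write_text("dummy log\n")

# Start a short-lived stand-in for the starter process.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
proc.terminate()        # ask the child to exit (SIGTERM on POSIX)
proc.wait()             # block until it is gone
log_file.rename(basedir / "arangodb.log.old")   # keep the previous log around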
Example #3
    def finish_setup_impl(self):
        # finish setup by starting the replications
        self.set_frontend_instances()

        self.checks["startReplJS"] = (
            "launching replication",
            """
print(
require("@arangodb/replication").setupReplicationGlobal({
    endpoint: "%s://127.0.0.1:%s",
    username: "******",
    password: "******",
    verbose: false,
    includeSystem: true,
    incremental: true,
    autoResync: true
    }));
print("replication started")
process.exit(0);
""" % (
                self.get_protocol(),
                str(self.leader_starter_instance.get_frontend_port()),
                self.leader_starter_instance.get_passvoid(),
            ),
        )
        lh.subsubsection("prepare leader follower replication")
        arangosh_script = self.checks["beforeReplJS"]
        logging.info(
            str(self.leader_starter_instance.execute_frontend(
                arangosh_script)))

        lh.subsubsection("start leader follwer replication")
        arangosh_script = self.checks["startReplJS"]
        retval = self.follower_starter_instance.execute_frontend(
            arangosh_script)
        if not retval:
            raise Exception("Failed to start the replication using: %s %s" %
                            (retval, str(self.checks["startReplJS"])))

        logging.info("Replication started successfully")

        logging.info("save document")
        arangosh_script = self.checks["afterReplJS"]
        logging.info(
            str(self.leader_starter_instance.execute_frontend(
                arangosh_script)))
        self.makedata_instances.append(self.leader_starter_instance)
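The startReplJS entry above is a plain %-formatted arangosh script: the protocol, the leader's frontend port and the passvoid are substituted into the template before it is handed to execute_frontend(). A small sketch of just that substitution step, with made-up values standing in for get_protocol(), get_frontend_port() and get_passvoid():

# Only the %-substitution is shown; the filled-in script would then be
# passed to execute_frontend() on the follower.
template = """
require("@arangodb/replication").setupReplicationGlobal({
    endpoint: "%s://127.0.0.1:%s",
    password: "%s"
});
"""
script = template % ("http", "8529", "secret-passvoid")
print(script)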
Example #4
    def terminate_instance(self, keep_instances=False):
        """terminate the instance of this starter
        (it should kill all its managed services)"""

        lh.subsubsection("terminating instances for: " + str(self.name))
        logging.info("StarterManager: Terminating starter instance: %s", str(self.arguments))

        logging.info("This should terminate all child processes")
        self.instance.terminate()
        logging.info("StarterManager: waiting for process to exit")
        exit_code = self.instance.wait()
        self.add_logfile_to_report()
        # workaround BTS-815: starter exits 15 on the wintendo:
        if IS_WINDOWS and exit_code == 15:
            exit_code = 0

        if exit_code != 0:
            raise Exception("Starter %s exited with %d" % (self.basedir, exit_code))

        old_log = self.basedir / "arangodb.log.old"
        logging.info(
            "StarterManager: done - moving logfile from %s to %s",
            str(self.log_file),
            str(old_log),
        )
        if old_log.exists():
            old_log.unlink()
        self.log_file.rename(old_log)

        for instance in self.all_instances:
            instance.rename_logfile()
            if not instance.detect_gone(verbose=False):
                print("Manually terminating instance!")
                instance.terminate_instance(False)
        # Clear instances as they have been stopped and the logfiles
        # have been moved.
        if not keep_instances:
            self.is_leader = False
            self.all_instances = []
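Two details in this variant are worth isolating: a known benign exit code is tolerated on Windows (the BTS-815 workaround), and a stale .old log is unlinked before the rename so rename() cannot fail over an existing file. A standalone sketch of both, using a short-lived child process and a temporary directory as stand-ins:

import platform
import subprocess
import sys
import tempfile
from pathlib import Path

IS_WINDOWS = platform.system() == "Windows"

proc = subprocess.Popen([sys.executable, "-c", "raise SystemExit(0)"])
exit_code = proc.wait()
if IS_WINDOWS and exit_code == 15:          # known benign exit code on Windows
    exit_code = 0
if exit_code != 0:
    raise Exception("process exited with %d" % exit_code)

basedir = Path(tempfile.mkdtemp())
log_file = basedir / "arangodb.log"
log_file.write_text("log\n")
old_log = basedir / "arangodb.log.old"
if old_log.exists():        # rename() over an existing file fails on Windows
    old_log.unlink()
log_file.rename(old_log)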
Example #5
    def install(self, inst):
        """install the package to the system"""
        self.progress(True, "{0} - install package".format(str(self.name)))

        kill_all_processes(False)
        if self.do_install:
            lh.subsubsection("installing server package")
            inst.install_server_package()
            self.cfg.set_directories(inst.cfg)
            lh.subsubsection("checking files")
            inst.check_installed_files()
            lh.subsubsection("saving config")
            inst.save_config()

            lh.subsubsection("checking if service is up")
            if inst.check_service_up():
                lh.subsubsection("stopping service")
                inst.stop_service()
            inst.broadcast_bind()
            lh.subsubsection("outputting version")
            inst.output_arangod_version()
            inst.get_starter_version()
            inst.get_sync_version()

            lh.subsubsection("starting service")

            inst.start_service()

            inst.check_installed_paths()
            inst.check_engine_file()

            if not self.new_installer:
                # only install debug package for new package.
                self.progress(True, "installing debug package:")

        # start / stop
        if inst.check_service_up():
            inst.stop_service()
        inst.start_service()
        print(inst.cfg.semver)
        sys_arangosh = ArangoshExecutor(inst.cfg, inst.instance)

        logging.debug("self test after installation")
        if inst.cfg.have_system_service:
            sys_arangosh.self_test()

        if self.do_system_test:
            sys_arangosh.js_version_check()
            # TODO: here we should invoke Makedata for the system installation.

            logging.debug("stop system service to make ports available for starter")
            inst.stop_service()
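The install path keeps repeating one guard: check whether the system service is up, stop it if so, then start it again so the test begins from a known state. A minimal sketch of that guard with hypothetical callables (nothing here is the installer's real API):

from typing import Callable

def restart_from_known_state(check_up: Callable[[], bool],
                             stop: Callable[[], None],
                             start: Callable[[], None]) -> None:
    """Stop the service if it is running, then start it fresh."""
    if check_up():
        stop()
    start()

# Hypothetical usage with dummy callables that just flip a flag.
state = {"up": True}
restart_from_known_state(lambda: state["up"],
                         lambda: state.update(up=False),
                         lambda: state.update(up=True))
print(state)    # {'up': True}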
Example #6
    def jam_attempt_impl(self):
        # pylint: disable=too-many-statements
        # this is simply too slow to be worthwhile:
        # collections = self.get_collection_list()
        lh.subsubsection("wait for all shards to be in sync")
        retval = self.starter_instances[0].execute_frontend(
            self.check_collections_in_sync, True)
        if not retval:
            raise Exception("Failed to ensure the cluster is in sync: %s %s" %
                            (retval, str(self.check_collections_in_sync)))
        print("all in sync.")
        agency_leader = self.agency_get_leader()
        terminate_instance = 2
        survive_instance = 1
        if self.starter_instances[terminate_instance].have_this_instance(
                agency_leader):
            print(
                "Cluster instance 2 has the agency leader; killing 1 instead")
            terminate_instance = 1
            survive_instance = 2

        logging.info("stopping instance %d" % terminate_instance)
        uuid = self.starter_instances[terminate_instance].get_dbservers(
        )[0].get_uuid()
        self.starter_instances[terminate_instance].terminate_instance(
            keep_instances=True)
        logging.info("relaunching agent!")
        self.starter_instances[terminate_instance].manually_launch_instances(
            [InstanceType.AGENT], [], False, False)

        self.set_frontend_instances()

        prompt_user(self.basecfg, "instance stopped")
        if self.selenium:
            self.selenium.jam_step_1()

        ret = self.starter_instances[0].arangosh.check_test_data(
            "Cluster one node missing", True, ["--disabledDbserverUUID", uuid])
        if not ret[0]:
            raise Exception("check data failed " + ret[1])

        ret = self.starter_instances[
            survive_instance].arangosh.check_test_data(
                "Cluster one node missing", True,
                ["--disabledDbserverUUID", uuid])
        if not ret[0]:
            raise Exception("check data failed " + ret[1])

        # respawn instance, and get its state fixed
        self.starter_instances[terminate_instance].respawn_instance()
        self.set_frontend_instances()
        counter = 300
        while not self.starter_instances[terminate_instance].is_instance_up():
            if counter <= 0:
                raise Exception("Instance did not respawn in 5 minutes!")
            progress(".")
            time.sleep(1)
            counter -= 1
        print()
        self.starter_instances[terminate_instance].detect_instances()
        self.starter_instances[terminate_instance].detect_instance_pids()
        self.starter_instances[
            terminate_instance].detect_instance_pids_still_alive()
        self.set_frontend_instances()

        logging.info("jamming: Starting instance without jwt")
        moreopts = ["--starter.join", "127.0.0.1:9528"]
        if self.cfg.ssl and not self.cfg.use_auto_certs:
            keyfile = self.cert_dir / Path("nodeX") / "tls.keyfile"
            self.generate_keyfile(keyfile)
            moreopts.append(f"--ssl.keyfile={keyfile}")
        dead_instance = StarterManager(
            self.basecfg,
            Path("CLUSTER"),
            "nodeX",
            mode="cluster",
            jwt_str=None,
            expect_instances=[
                InstanceType.AGENT,
                InstanceType.COORDINATOR,
                InstanceType.DBSERVER,
            ],
            moreopts=moreopts,
        )
        dead_instance.run_starter(expect_to_fail=True)

        i = 0
        while True:
            logging.info(". %d", i)
            if not dead_instance.is_instance_running():
                dead_instance.check_that_starter_log_contains(
                    "Unauthorized. Wrong credentials.")
                break
            if i > 40:
                logging.info("Giving up wating for the starter to exit")
                raise Exception("non-jwt-ed starter won't exit")
            i += 1
            time.sleep(10)
        logging.info(str(dead_instance.instance.wait(timeout=320)))
        logging.info("dead instance is dead?")

        prompt_user(self.basecfg, "cluster should be up")
        if self.selenium:
            self.selenium.jam_step_2()
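The respawn step above waits for the instance with a bounded polling loop: one probe per second, at most 300 tries, then give up. The same pattern in isolation, with a hypothetical predicate standing in for is_instance_up():

import time

def wait_until(predicate, attempts=300, delay=1.0):
    """Poll predicate() until it returns true or the attempt budget is spent."""
    for _ in range(attempts):
        if predicate():
            return
        time.sleep(delay)
    raise Exception("Instance did not respawn in %d seconds!" % int(attempts * delay))

# Hypothetical usage: the stand-in predicate becomes true after three polls.
remaining = {"tries": 3}
def instance_up():
    remaining["tries"] -= 1
    return remaining["tries"] <= 0

wait_until(instance_up, attempts=10, delay=0.01)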
Example #7
    def downgrade_arangod_version_manual_impl(self):
        """manual upgrade this installation"""
        lh.subsubsection("wait for all shards to be in sync")
        retval = self.starter_instances[0].execute_frontend(
            self.check_collections_in_sync)
        if not retval:
            raise Exception("Failed to ensure the cluster is in sync: %s %s" %
                            (retval, str(self.check_collections_in_sync)))
        self.progress(True, "manual upgrade step 1 - stop instances")
        self.starter_instances[0].maintainance(False, InstanceType.COORDINATOR)
        for node in self.starter_instances:
            node.replace_binary_for_upgrade(self.new_cfg, False)
        for node in self.starter_instances:
            node.detect_instance_pids_still_alive()

        # fmt: off
        self.progress(True, "step 2 - upgrade agents")
        for node in self.starter_instances:
            node.upgrade_instances(
                [InstanceType.AGENT],
                [
                    '--database.auto-upgrade', 'true', '--log.foreground-tty',
                    'true'
                ],
                # mitigate 3.6x agency shutdown issues:
                self.cfg.version >= arangoversions['370'])
        self.progress(True, "step 3 - upgrade db-servers")
        for node in self.starter_instances:
            node.upgrade_instances([InstanceType.DBSERVER], [
                '--database.auto-upgrade', 'true', '--log.foreground-tty',
                'true'
            ])
        self.progress(True, "step 4 - coordinator upgrade")
        # now the new cluster is running. we will now run the coordinator upgrades
        for node in self.starter_instances:
            logging.info("upgrading coordinator instances\n" + str(node))
            node.upgrade_instances([InstanceType.COORDINATOR], [
                '--database.auto-upgrade',
                'true',
                '--javascript.copy-installation',
                'true',
                '--server.rest-server',
                'false',
            ])
        # fmt: on
        self.progress(True, "step 5 restart the full cluster ")
        for node in self.starter_instances:
            node.respawn_instance()
        self.progress(True, "step 6 wait for the cluster to be up")
        for node in self.starter_instances:
            node.detect_instances()
            node.wait_for_version_reply()

        # now the upgrade should be done.
        for node in self.starter_instances:
            node.detect_instances()
            node.wait_for_version_reply()
            node.probe_leader()
        self.set_frontend_instances()
        self.starter_instances[0].maintainance(False, InstanceType.COORDINATOR)

        if self.selenium:
            self.selenium.test_wait_for_upgrade()  # * 5s
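The manual procedure walks the instance types in a fixed order: agents first, then db-servers, then coordinators, each started once with --database.auto-upgrade true. A compact sketch of that ordering with a hypothetical upgrade_instances stand-in and made-up node names:

from enum import Enum, auto

class InstanceType(Enum):
    AGENT = auto()
    DBSERVER = auto()
    COORDINATOR = auto()

# Fixed rolling-upgrade order used by the procedure above.
UPGRADE_ORDER = [InstanceType.AGENT, InstanceType.DBSERVER, InstanceType.COORDINATOR]

def upgrade_instances(node: str, itype: InstanceType) -> None:
    # Stand-in for the real starter call; it only reports what would happen.
    print("upgrading %s on %s with --database.auto-upgrade true" % (itype.name, node))

nodes = ["node1", "node2", "node3"]      # hypothetical cluster members
for itype in UPGRADE_ORDER:
    for node in nodes:
        upgrade_instances(node, itype)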