Example #1
    def kill_servers(self, leader, replicas, N):
        """Kill a subset of servers in order to simulate failures.

        Args:
          leader (str): hostname of current leader.
          replicas (list): list of replica hostnames.
          N (int): Number of hosts (including leader) to stop.

        Returns:
          kill_list (set): set of hosts that were stopped.

        """
        kill_list = set(random.sample(replicas, N))
        if leader not in kill_list:
            kill_list.pop()
            kill_list.add(leader)
        self.log.info("*** stopping leader (%s) + %d others", leader, N-1)
        stop_processes(kill_list,
                       self.server_managers[0].manager.job.command_regex)

        kill_ranks = self.server_managers[0].get_host_ranks(kill_list)
        self.assertGreaterEqual(len(kill_ranks), len(kill_list),
            "Unable to obtain expected ranks for {}".format(kill_list))
        self.server_managers[0].update_expected_states(
            kill_ranks, ["stopped", "excluded"])

        return kill_list
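The subtle part of the helper above is the leader substitution: a random sample of N hosts is drawn first, and the leader is swapped in only if it was not picked, so the kill list always contains exactly N hosts including the leader. A minimal, self-contained sketch of just that logic, using placeholder host names:

import random

# Placeholder hosts; in the helper above these come from the test harness.
replicas = ["server-1", "server-2", "server-3", "server-4"]
leader = "server-2"
N = 2

kill_list = set(random.sample(replicas, N))
if leader not in kill_list:
    # Drop an arbitrary sampled host and force the leader into the set.
    kill_list.pop()
    kill_list.add(leader)

print(kill_list)  # N hosts, leader guaranteed to be included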
Example #2
    def stop_leftover_processes(self, processes, hosts):
        """Stop leftover processes on the specified hosts before starting tests.

        Args:
            processes (list): list of process names to stop
            hosts (list): list of hosts on which to stop the leftover processes
        """
        if processes:
            self.log.info(
                "Stopping any of the following commands left running on %s: %s",
                hosts, ",".join(processes))
            stop_processes(hosts, "'({})'".format("|".join(processes)))
Example #3
    def kill(self):
        """Forcibly terminate any sub process running on hosts."""
        regex = self.manager.job.command_regex
        result = stop_processes(self._hosts, regex)
        if 0 in result and len(result) == 1:
            print("No remote {} processes killed (none found), done.".format(
                regex))
        else:
            print(
                "***At least one remote {} process needed to be killed! Please "
                "investigate/report.***".format(regex))
Example #4
    def kill(self):
        """Forcibly terminate any job processes running on hosts."""
        regex = self.job.command_regex
        result = stop_processes(self._hosts, regex)
        if 0 in result and len(result) == 1:
            self.log.info("No remote %s processes killed (none found), done.",
                          regex)
        else:
            self.log.info(
                "***At least one remote %s process needed to be killed! Please "
                "investigate/report.***", regex)
Example #5
    def kill(self):
        """Forcibly terminate any server process running on hosts."""
        regex = self.manager.job.command_regex
        # Try to dump all servers' ULT stacks before killing them.
        result = stop_processes(self._hosts, regex)
        if 0 in result and len(result) == 1:
            print("No remote {} server processes killed (none found), done.".format(
                regex))
        else:
            print(
                "***At least one remote {} server process needed to be killed! "
                "Please investigate/report.***".format(regex))
        # Set the stopped servers' state to keep teardown happy.
        self.update_expected_states(None, ["stopped", "excluded", "errored"])
Example #6
    def test_daos_server_dump_basic(self):
        """JIRA ID: DAOS-1452.

        Test Description: Test engine ULT stacks dump.

        :avocado: tags=all,daily_regression
        :avocado: tags=vm
        :avocado: tags=control,server_start,basic
        :avocado: tags=daos_server_dump_test,test_daos_server_dump_basic
        """

        ret_codes = stop_processes(self.hostlist_servers,
                                   r"daos_engine",
                                   added_filter=r"'\<(grep|defunct)\>'",
                                   dump_ult_stacks=True)
        if 1 in ret_codes:
            print("Stopped daos_engine processes on {}".format(
                str(ret_codes[1])))
        if 0 in ret_codes:
            print("No daos_engine processes found on {}".format(
                str(ret_codes[0])))

        # XXX may need to check for one file per engine...
        ret_codes = pcmd(self.hostlist_servers, r"ls /tmp/daos_dump*.txt")
        # Report any failures
        if len(ret_codes) > 1 or 0 not in ret_codes:
            failed = [
                "{}: rc={}".format(val, key) for key, val in ret_codes.items()
                if key != 0
            ]
            self.fail("no ULT stacks dump found on following hosts: {}".format(
                ", ".join(failed)))

        self.log.info("Test passed!")

        # set stopped servers state to make teardown happy
        self.server_managers[0].update_expected_states(
            None, ["stopped", "excluded", "errored"])
Example #7
    def cleanup_processes(self):
        """Clean up cart processes, in case avocado/apricot does not."""
        error_list = []
        localhost = socket.gethostname().split(".")[0:1]
        processes = r"'\<(crt_launch|orterun)\>'"
        retry_count = 0
        while retry_count < 2:
            result = stop_processes(localhost,
                                    processes,
                                    added_filter=r"'\<(grep|defunct)\>'")
            if 1 in result:
                self.log.info(
                    "Stopped '%s' processes on %s", processes, str(result[1]))
                retry_count += 1
            elif 0 in result:
                self.log.info("All '%s' processes have been stopped", processes)
                retry_count = 99
            else:
                error_list.append("Error detecting/stopping cart processes")
                retry_count = 99
        if retry_count == 2:
            error_list.append("Unable to stop cart processes!")
        return error_list
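The loop above allows at most two kill passes: a result containing key 1 means something was stopped and is worth re-checking, a lone key 0 means nothing is left running, and anything else is treated as an error. A self-contained sketch of that control flow, driven by a fabricated sequence of stop_processes-style results:

# Fabricated results: the first pass stops something, the second finds nothing left.
attempts = iter([{1: ["localhost"]}, {0: ["localhost"]}])

error_list = []
retry_count = 0
while retry_count < 2:
    result = next(attempts)
    if 1 in result:
        retry_count += 1   # something was stopped; check again
    elif 0 in result:
        retry_count = 99   # nothing left running; leave the loop
    else:
        error_list.append("Error detecting/stopping processes")
        retry_count = 99
if retry_count == 2:
    error_list.append("Unable to stop processes!")

print(error_list)  # [] here: one kill pass followed by a clean check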
Example #8
def cleanup_processes():
    """Clean up cart processes, in case avocado/apricot does not."""
    stop_processes(["localhost"], "'(crt_launch|orterun)'")
Example #9
    def test_agent_failure_isolation(self):
        """Jira ID: DAOS-9385.

        1. Create a pool and a container.
        2. Run IOR from the two client nodes.
        3. Stop the daos_agent process while IOR is running on one of the clients.
        4. Wait until both IOR runs end.
        5. Check that there's an error on the kill client, but not on the keep client.
        6. On the killed client, verify that journalctl shows a message indicating the
        agent was stopped.
        7. On the other client, where the agent is still running, verify that journalctl
        doesn't show that the agent was stopped.
        8. Restart both daos_agent processes.
        9. Run IOR again from the keep client. It should succeed without any error.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,large
        :avocado: tags=deployment,fault_management,agent_failure
        :avocado: tags=agent_failure_isolation
        """
        # 1. Create a pool and a container.
        self.add_pool()
        self.add_container(self.pool)

        agent_hosts = self.agent_managers[0].hosts
        self.log.info("agent_hosts = %s", agent_hosts)
        agent_host_keep = agent_hosts[0]
        agent_host_kill = agent_hosts[1]

        # 2. Run IOR from the two client nodes.
        ior_results = {}
        job_num_keep = 1
        job_num_kill = 2
        self.log.info("Run IOR with thread")
        thread_1 = threading.Thread(
            target=self.run_ior_collect_error,
            args=[ior_results, job_num_keep, "test_file_1", [agent_host_keep]])
        thread_2 = threading.Thread(
            target=self.run_ior_collect_error,
            args=[ior_results, job_num_kill, "test_file_2", [agent_host_kill]])

        self.log.info("Start IOR 1 (thread)")
        thread_1.start()
        thread_2.start()

        # We need to stop daos_agent while IOR is running, so wait a few seconds
        # for IOR to start.
        self.log.info("Waiting 5 sec for IOR to start writing data...")
        time.sleep(5)

        errors = []

        # 3. Stop daos_agent process while IOR is running on one of the clients.
        since = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.log.info("Stopping agent on %s", agent_host_kill)
        pattern = self.agent_managers[0].manager.job.command_regex
        result = stop_processes(hosts=[agent_host_kill], pattern=pattern)
        if 0 in result and len(result) == 1:
            msg = "No daos_agent process killed from {}!".format(agent_host_kill)
            errors.append(msg)
        else:
            self.log.info("daos_agent in %s killed", agent_host_kill)
        until = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # 4. Wait until both of the IOR threads end.
        thread_1.join()
        thread_2.join()

        # 5. Check that there's an error on the kill client, but not on the keep client.
        self.log.info("--- IOR results Kill ---")
        self.log.info(ior_results[job_num_kill])
        if ior_results[job_num_kill][0]:
            errors.append("IOR on agent kill host worked!")

        self.log.info("--- IOR results Keep ---")
        self.log.info(ior_results[job_num_keep])
        if not ior_results[job_num_keep][0]:
            ior_error = ior_results[job_num_keep][-1]
            errors.append("Error found in IOR on keep client! {}".format(ior_error))

        # 6. On the killed client, verify that journalctl shows that the agent was
        # stopped.
        results = get_journalctl(
            hosts=[agent_host_kill], since=since, until=until,
            journalctl_type="daos_agent")
        self.log.info("journalctl results (kill) = %s", results)
        if "shutting down" not in results[0]["data"]:
            msg = ("Agent shut down message not found in journalctl on killed client! "
                   "Output = {}".format(results))
            errors.append(msg)

        # 7. On the other client, where the agent is still running, verify that
        # journalctl doesn't show that the agent was stopped.
        results = get_journalctl(
            hosts=[agent_host_keep], since=since, until=until,
            journalctl_type="daos_agent")
        self.log.info("journalctl results (keep) = %s", results)
        if "shutting down" in results[0]["data"]:
            msg = ("Agent shut down message found in journalctl on keep client! "
                   "Output = {}".format(results))
            errors.append(msg)

        # 8. Restart both daos_agent. (Currently, there's no clean way to restart one.)
        self.start_agent_managers()

        # 9. Run IOR again from the keep client. It should succeed without any error.
        self.log.info("--- Start IOR 2 ---")
        self.run_ior_collect_error(
            job_num=job_num_keep, results=ior_results, file_name="test_file_3",
            clients=agent_hosts)

        # Verify that there's no error.
        self.log.info("--- IOR results 2 ---")
        self.log.info(ior_results[job_num_keep])
        if not ior_results[job_num_keep][0]:
            ior_error = ior_results[job_num_keep][-1]
            errors.append("Error found in second IOR run! {}".format(ior_error))

        self.log.info("########## Errors ##########")
        report_errors(test=self, errors=errors)
        self.log.info("############################")
Example #10
    def kill(self):
        """Forcibly terminate any sub process running on hosts."""
        stop_processes(self._hosts, "'({})'".format("|".join(self._exe_names)))
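A self-contained sketch of how the quoted alternation pattern used in this last example (and in Example #2) is assembled from a list of executable names; the names below are placeholders:

# Placeholder executable names; in the method above they come from self._exe_names.
exe_names = ["crt_launch", "orterun"]

# Produces "'(crt_launch|orterun)'", the single-quoted alternation these
# examples pass to stop_processes as the process pattern.
pattern = "'({})'".format("|".join(exe_names))
print(pattern)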