Example #1
 def test_upgrade_negative(self):
     op = self.input.param("op", None)
     error = self.input.param("error", '')
     remote = RemoteMachineShellConnection(self.master)
     if op is None:
         self.fail("operation should be specified")
     if op == "higher_version":
         tmp = self.initial_version
         self.initial_version = self.upgrade_versions[0]
         self.upgrade_versions = [tmp, ]
     info = None
     if op == "wrong_arch":
         info = remote.extract_remote_info()
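         # flip the detected architecture (x86 <-> x86_64) so the upgrade is attempted with a mismatched package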
         info.architecture_type = ('x86_64', 'x86')[info.architecture_type == 'x86']
     self._install([self.master])
     self.operations([self.master])
     try:
         if op == "close_port":
             RemoteUtilHelper.enable_firewall(self.master)
         for upgrade_version in self.upgrade_versions:
             self.sleep(self.sleep_time, "Pre-setup of old version is done. Wait for upgrade to {0} version".\
                    format(upgrade_version))
             # capture the command's stderr separately so it does not shadow the expected error string from the test params
             output, stderr = self._upgrade(upgrade_version, self.master, info=info)
             if str(output).find(error) != -1 or str(stderr).find(error) != -1:
                 raise Exception(error)
     except Exception as ex:
         self.log.info("Exception %s appeared as expected" % ex)
         self.log.info("Check that old version is working fine")
         self.verification([self.master])
Example #2
    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        for node in chosen:
            if failover_reason == 'stop_server':
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # define precondition check for failover
        failed_over = self.rest.fail_over(node.id, graceful=self.graceful)

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and failed_over:
            """ Check if the fail_over fails as expected """
            self.assertTrue(not failed_over,""" Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")
Example #3
    def test_node_firewall_enabled(self):
        timeout = self.timeout / 2

        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(self.server_fail)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.disable_firewall()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 0,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        self.rest.rebalance(
            otpNodes=[node.id for node in self.rest.node_statuses()],
            ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name,
                                    self.loaded_items[bucket.name])
Example #4
 def run_failover_operations_with_ops(self, chosen, failover_reason):
     """ Method to run fail over operations used in the test scenario based on failover reason """
     # Perform operations related to failover
     failed_over = True
     for node in chosen:
         unreachable = False
         if failover_reason == 'stop_server':
             unreachable = True
             self.stop_server(node)
             self.log.info("10 seconds delay to wait for membase-server to shutdown")
             # wait for 5 minutes until node is down
             self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                 msg="node status is not unhealthy even after waiting for 5 minutes")
         elif failover_reason == "firewall":
             unreachable = True
             self.filter_list.append(node.ip)
             server = [srv for srv in self.servers if node.ip == srv.ip][0]
             RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
             status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
             if status:
                 self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
             else:
                 # verify iptables on the node if something wrong
                 for server in self.servers:
                     if server.ip == node.ip:
                         shell = RemoteMachineShellConnection(server)
                         info = shell.extract_remote_info()
                         if info.type.lower() == "windows":
                             o, r = shell.execute_command("netsh advfirewall show allprofiles")
                             shell.log_command_output(o, r)
                         else:
                             o, r = shell.execute_command("/sbin/iptables --list")
                             shell.log_command_output(o, r)
                         shell.disconnect()
                 self.rest.print_UI_logs()
                 api = self.rest.baseUrl + 'nodeStatuses'
                 status, content, header = self.rest._http_request(api)
                 json_parsed = json.loads(content)
                 self.log.info("nodeStatuses: {0}".format(json_parsed))
                 self.fail("node status is not unhealthy even after waiting for 5 minutes")
     nodes = self.filter_servers(self.servers, chosen)
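     # start the failover asynchronously so compaction, view and mutation ops can run while it is in progress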
     failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
     # Perform Compaction
     compact_tasks = []
     if self.compact:
         for bucket in self.buckets:
             compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
     # Run View Operations
     if self.withViewsOps:
         self.query_and_monitor_view_tasks(nodes)
     # Run mutation operations
     if self.withMutationOps:
         self.run_mutation_operations()
     failed_over.result()
     for task in compact_tasks:
         task.result()
     msg = "rebalance failed while removing failover nodes {0}".format(node.id)
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
Example #5
    def test_topology_change_events(self):
        available_server_before_rebalance = copy.deepcopy(self.available_servers)
        try:
            self.log.info("Enabling firewall between Incoming node and CBAS CC "
                          "node to trigger topology_change_failed event")
            for node in available_server_before_rebalance:
                RemoteUtilHelper.enable_firewall(
                    node, bidirectional=False, xdcr=False,
                    action_on_packet="REJECT", block_ips=[self.cluster.cbas_cc_node.ip],
                    all_interface=True)

            self.log.info("Rebalancing IN CBAS node to trigger "
                          "topology_change_started event")
            rebalance_task, self.available_servers = self.rebalance_util.rebalance(
                self.cluster, kv_nodes_in=0, kv_nodes_out=0,
                cbas_nodes_in=1, cbas_nodes_out=0,
                available_servers=self.available_servers, exclude_nodes=[])

            if self.rebalance_util.wait_for_rebalance_task_to_complete(
                    rebalance_task, self.cluster, check_cbas_running=False):
                raise Exception("Rebalance passed when it should have failed.")

            self.log.info("Disabling firewall between Incoming node and CBAS CC "
                          "node and retriggering rebalance to trigger "
                          "topology_change_completed event")
            for node in available_server_before_rebalance:
                remote_client = RemoteMachineShellConnection(node)
                remote_client.disable_firewall()
                remote_client.disconnect()

            rebalance_task, self.available_servers = self.rebalance_util.rebalance(
                self.cluster, kv_nodes_in=0, kv_nodes_out=0,
                cbas_nodes_in=0, cbas_nodes_out=0,
                available_servers=self.available_servers, exclude_nodes=[])

            if not self.rebalance_util.wait_for_rebalance_task_to_complete(
                    rebalance_task, self.cluster, check_cbas_running=False):
                raise Exception("Rebalance failed even after disabling "
                                "firewall")

            self.log.info("Adding event for topology_change_started event")
            self.system_events.add_event(AnalyticsEvents.topology_change_started(
                self.cluster.cbas_cc_node.ip, 2, 0))

            self.log.info("Adding event for topology_change_failed event")
            self.system_events.add_event(AnalyticsEvents.topology_change_failed(
                self.cluster.cbas_cc_node.ip, 2, 0))

            self.log.info("Adding event for topology_change_completed event")
            self.system_events.add_event(AnalyticsEvents.topology_change_completed(
                self.cluster.cbas_cc_node.ip, 2, 0))
        except Exception as err:
            self.log.info("Disabling Firewall")
            for node in available_server_before_rebalance:
                remote_client = RemoteMachineShellConnection(node)
                remote_client.disable_firewall()
                remote_client.disconnect()
            self.fail(str(err))
Example #6
 def run_failover_operations_with_ops(self, chosen, failover_reason):
     """ Method to run fail over operations used in the test scenario based on failover reason """
     # Perform operations related to failover
     failed_over = True
     for node in chosen:
         unreachable = False
         if failover_reason == 'stop_server':
             unreachable=True
             self.stop_server(node)
             self.log.info("10 seconds delay to wait for membase-server to shutdown")
             # wait for 5 minutes until node is down
             self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                 msg="node status is not unhealthy even after waiting for 5 minutes")
         elif failover_reason == "firewall":
             unreachable=True
             self.filter_list.append(node.ip)
             server = [srv for srv in self.servers if node.ip == srv.ip][0]
             RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
             status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
             if status:
                 self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
             else:
                 # verify iptables on the node if something wrong
                 for server in self.servers:
                     if server.ip == node.ip:
                         shell = RemoteMachineShellConnection(server)
                         info = shell.extract_remote_info()
                         if info.type.lower() == "windows":
                             o, r = shell.execute_command("netsh advfirewall show allprofiles")
                             shell.log_command_output(o, r)
                         else:
                             o, r = shell.execute_command("/sbin/iptables --list")
                             shell.log_command_output(o, r)
                         shell.disconnect()
                 self.rest.print_UI_logs()
                 api = self.rest.baseUrl + 'nodeStatuses'
                 status, content, header = self.rest._http_request(api)
                 json_parsed = json.loads(content)
                 self.log.info("nodeStatuses: {0}".format(json_parsed))
                 self.fail("node status is not unhealthy even after waiting for 5 minutes")
     nodes = self.filter_servers(self.servers, chosen)
     failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
     # Perform Compaction
     compact_tasks = []
     if self.compact:
         for bucket in self.buckets:
             compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
     # Run View Operations
     if self.withViewsOps:
         self.query_and_monitor_view_tasks(nodes)
     # Run mutation operations
     if self.withMutationOps:
         self.run_mutation_operations()
     failed_over.result()
     for task in compact_tasks:
         task.result()
     msg = "rebalance failed while removing failover nodes {0}".format(node.id)
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
Example #7
 def test_60s_timeout_firewall(self):
     timeout = self.timeout
     server_fail = self._servers[1]
     status = self.rest.update_autofailover_settings(True, timeout)
     if not status:
         self.fail('failed to change autofailover_settings! See MB-7282')
     self.sleep(5)
     RemoteUtilHelper.enable_firewall(server_fail)
     AutoFailoverBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)
Example #8
 def test_60s_timeout_firewall(self):
     timeout = self.timeout
     server_fail = self.servers[1]
     status = self.rest.update_autofailover_settings(True, timeout)
     if not status:
         self.fail('failed to change autofailover_settings! See MB-7282')
     self.sleep(5)
     RemoteUtilHelper.enable_firewall(server_fail)
     AutoFailoverBaseTest.wait_for_failover_or_assert(
         self.master, 1,
         timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)
Example #9
 def test_firewall_node_when_autoreprovisioning(self):
     wait_timeout = 120
     before = self.input.param("before", True)
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     if shell.extract_remote_info().type.lower() == 'windows':
         o, r = shell.execute_command("shutdown -r -f -t 0")
     elif shell.extract_remote_info().type.lower() == 'linux':
         o, r = shell.execute_command("reboot")
     shell.log_command_output(o, r)
     if shell.extract_remote_info().type.lower() == 'windows':
         time.sleep(wait_timeout * 5)
     else:
         time.sleep(wait_timeout)
     # disable firewall on the node
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.disable_firewall()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(
         self.master, 0,
         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(),
                     "cluster status is not healthy")
     self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
     # self.sleep(5)
     if before:
         RemoteUtilHelper.enable_firewall(self.servers[2])
     self.rest.rebalance(
         otpNodes=[node.id for node in self.rest.node_statuses()],
         ejectedNodes=[])
     if not before:
         RemoteUtilHelper.enable_firewall(self.servers[2])
     # self.sleep(5)
     try:
         self.rest.monitorRebalance()
         self.fail("Rebalance failed expected")
     except RebalanceFailedException:
         self.log.info("Rebalance failed but it's expected")
     shell = RemoteMachineShellConnection(self.servers[2])
     shell.disable_firewall()
     self.sleep(5)
     self.rest.rebalance(
         otpNodes=[node.id for node in self.rest.node_statuses()],
         ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name,
                                 self.loaded_items[bucket.name])
Example #10
 def test_60s_timeout_firewall(self):
     # AUTOFAIL_TEST_5
     timeout = self.timeout
     server_fail = self._servers[1]
     status = self.rest.update_autofailover_settings(True, timeout)
     if not status:
         self.fail('failed to change autofailover_settings!')
     self.sleep(5)
     time_start = time.time()
     RemoteUtilHelper.enable_firewall(server_fail)
     AutoFailoverBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + self.extra_timeout, self)
     time_end = time.time()
     msg = "{0} != {1}".format(time_end - time_start, timeout + self.extra_timeout)
     self.assertTrue(abs((time_end - time_start) - timeout - self.extra_timeout) <= AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, msg)
     self.log.info("expected failover in {0} seconds, actual time {1} seconds".format(timeout, time_end - time_start))
Example #11
 def test_60s_timeout_firewall(self):
     # AUTOFAIL_TEST_5
     timeout = self.timeout
     server_fail = self._servers[1]
     status = self.rest.update_autofailover_settings(True, timeout)
     if not status:
         self.fail('failed to change autofailover_settings!')
     time.sleep(5)
     time_start = time.time()
     RemoteUtilHelper.enable_firewall(server_fail)
     AutoFailoverBaseTest.wait_for_failover_or_assert(self.master, 1, timeout, self)
     time_end = time.time()
     msg = "{0} != {1}".format(time_end - time_start, timeout)
     self.assertTrue(abs((time_end - time_start) - timeout) <= AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, msg)
     self.log.info("expected failover in {0} seconds, actual time {1} seconds".format(timeout, time_end - time_start))
Example #12
 def test_firewall_node_when_autoreprovisioning(self):
     wait_timeout = 120
     before = self.input.param("before", True)
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     if shell.extract_remote_info().type.lower() == 'windows':
         o, r = shell.execute_command("shutdown -r -f -t 0")
     elif shell.extract_remote_info().type.lower() == 'linux':
         o, r = shell.execute_command("reboot")
     shell.log_command_output(o, r)
     if shell.extract_remote_info().type.lower() == 'windows':
         time.sleep(wait_timeout * 5)
     else:
         time.sleep(wait_timeout)
     # disable firewall on the node
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.disable_firewall()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                         self)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
     self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
     # self.sleep(5)
     if before:
         RemoteUtilHelper.enable_firewall(self.servers[2])
     self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
     if not before:
         RemoteUtilHelper.enable_firewall(self.servers[2])
     # self.sleep(5)
     try:
         self.rest.monitorRebalance()
         self.fail("Rebalance failed expected")
     except RebalanceFailedException:
         self.log.info("Rebalance failed but it's expected")
     shell = RemoteMachineShellConnection(self.servers[2])
     shell.disable_firewall()
     self.sleep(5)
     self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Example #13
    def test_node_firewall_enabled(self):
        timeout = self.timeout / 2

        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(self.server_fail)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.disable_firewall()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Example #14
 def test_node_memcached_failure_in_series(self):
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     data_lost = False
     for i in reversed(xrange(len(self.servers))):
         print self.servers[i]
         operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
         shell = RemoteMachineShellConnection(self.servers[i])
         print "operation", operation
         if i == 0:
             self.master = self.servers[1]
         if operation == 'stop':
             self._stop_couchbase(self.servers[i])
         elif operation == 'memcached_failure':
             self._pause_couchbase(self.servers[i])
         elif operation == 'restart':
             shell.restart_couchbase()
         elif operation == 'failover':
             RemoteUtilHelper.enable_firewall(self.servers[i])
         elif operation == 'reboot':
             if shell.extract_remote_info().type.lower() == 'windows':
                 o, r = shell.execute_command("shutdown -r -f -t 0")
                 self.sleep(200)
             elif shell.extract_remote_info().type.lower() == 'linux':
                 o, r = shell.execute_command("reboot")
             shell.log_command_output(o, r)
             self.sleep(60)
         self.sleep(40)
         if operation == 'memcached_failure':
             AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                               timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                               self)
         if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
             AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                                 timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                 self)
         if operation != 'restart':
             RemoteUtilHelper.common_basic_setup([self.servers[i]])
         AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                             timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                             self)
         helper = RestHelper(RestConnection(self.master))
         self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
         self.sleep(40)
         if operation == 'memcached_failure' or operation == 'failover':
             self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         else:
             if 'kv' in self.servers[i].services and self.replicas > 0:
                 self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                 self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
                 self.assertTrue(self.rest.monitorRebalance())
             else:
                 self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         buckets = self.rest.get_buckets()
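         # with no replicas, a restart or reboot loses that node's data, so data verification is skipped afterwards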
         if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
             data_lost = True
         for bucket in buckets:
             if not data_lost:
                 self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Example #15
    def common_test_body(self, keys_count, replica, load_ratio,
                         failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replica : {0}".format(replica))
        log.info("load_ratio : {0}".format(load_ratio))
        log.info("failover_reason : {0}".format(failover_reason))
        master = self._servers[0]
        log.info('picking server : {0} as the master'.format(master))
        rest = RestConnection(master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=master.rest_username,
                          password=master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        bucket_ram = info.memoryQuota * 2 / 3
        bucket = 'default'
        rest.create_bucket(bucket=bucket,
                           ramQuotaMB=bucket_ram,
                           replicaNumber=replica,
                           proxyPort=info.moxi)
        ready = BucketOperationHelper.wait_for_memcached(master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        credentials = self._input.membase_settings

        ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers,
                                                       credentials, self)
        nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
        msg = "rebalance failed after adding these nodes {0}".format(nodes)
        self.assertTrue(rest.monitorRebalance(), msg=msg)

        inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count,
                                                   load_ratio)
        inserted_count = len(inserted_keys)
        log.info('inserted {0} keys'.format(inserted_count))

        nodes = rest.node_statuses()
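        # repeatedly fail over and rebalance out <replica> nodes while enough nodes remain in the cluster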
        while (len(nodes) - replica) > 1:
            final_replication_state = RestHelper(rest).wait_for_replication(
                900)
            msg = "replication state after waiting for up to 15 minutes : {0}"
            self.log.info(msg.format(final_replication_state))
            chosen = RebalanceHelper.pick_nodes(master, howmany=replica)
            for node in chosen:
                #let's do op
                if failover_reason == 'stop_server':
                    self.stop_server(node)
                    log.info(
                        "10 seconds delay to wait for membase-server to shutdown"
                    )
                    #wait for 5 minutes until node is down
                    self.assertTrue(
                        RestHelper(rest).wait_for_node_status(
                            node, "unhealthy", 300),
                        msg=
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
                elif failover_reason == "firewall":
                    RemoteUtilHelper.enable_firewall(
                        self._servers, node, bidirectional=self.bidirectional)
                    self.assertTrue(
                        RestHelper(rest).wait_for_node_status(
                            node, "unhealthy", 300),
                        msg=
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )

                failed_over = rest.fail_over(node.id)
                if not failed_over:
                    self.log.info(
                        "unable to failover the node the first time. Retrying in 75 seconds..."
                    )
                    # try again in 75 seconds
                    time.sleep(75)
                    failed_over = rest.fail_over(node.id)
                self.assertTrue(
                    failed_over, "unable to failover node after {0}".format(
                        failover_reason))
                log.info("failed over node : {0}".format(node.id))
                self._failed_nodes.append(node.ip)

            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            FailoverBaseTest.replication_verification(master, bucket, replica,
                                                      inserted_count, self)

            nodes = rest.node_statuses()
        FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
Example #16
    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
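                    # interrupt the in-flight graceful failover on the victim node, then restart it below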
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(120)
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    self.sleep(180)
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen),
                                                                               self.get_failover_count()))
                    self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover))
Example #17
    def test_cbcollect_with_redaction_enabled_with_xdcr(self):
        rest_src = RestConnection(self.master)
        rest_src.remove_all_replications()
        rest_src.remove_all_remote_clusters()

        rest_dest = RestConnection(self.servers[1])
        rest_dest_helper = RestHelper(rest_dest)

        try:
            rest_src.remove_all_replications()
            rest_src.remove_all_remote_clusters()
            self.set_redaction_level()
            rest_src.add_remote_cluster(self.servers[1].ip,
                                        self.servers[1].port,
                                        self.servers[1].rest_username,
                                        self.servers[1].rest_password, "C2")
            """ at dest cluster """
            self.add_built_in_server_user(node=self.servers[1])
            rest_dest.create_bucket(bucket='default', ramQuotaMB=512)
            bucket_ready = rest_dest_helper.vbucket_map_ready('default')
            if not bucket_ready:
                self.fail(
                    "Bucket default at dest not created after 120 seconds.")
            repl_id = rest_src.start_replication('continuous', 'default', "C2")
            if repl_id is not None:
                self.log.info("Replication created successfully")
            gen = BlobGenerator("ent-backup",
                                "ent-backup-",
                                self.value_size,
                                end=self.num_items)
            tasks = self._async_load_all_buckets(self.master, gen, "create", 0)
            for task in tasks:
                task.result()
            self.sleep(10)
            """ enable firewall """
            if self.interrupt_replication:
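                # firewall the source node's XDCR traffic so replication is interrupted while logs are collected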
                RemoteUtilHelper.enable_firewall(self.master, xdcr=True)
            """ start collect logs """
            self.start_logs_collection()
            result = self.monitor_logs_collection()
            """ verify logs """
            try:
                logs_path = result["perNode"]["ns_1@" +
                                              str(self.master.ip)]["path"]
            except KeyError:
                logs_path = result["perNode"]["[email protected]"]["path"]
            redactFileName = logs_path.split('/')[-1]
            nonredactFileName = logs_path.split('/')[-1].replace(
                '-redacted', '')
            remotepath = logs_path[0:logs_path.rfind('/') + 1]
            self.verify_log_files_exist(remotepath=remotepath,
                                        redactFileName=redactFileName,
                                        nonredactFileName=nonredactFileName)
            self.log.info("Verify on log ns_server.goxdcr.log")
            self.verify_log_redaction(remotepath=remotepath,
                                      redactFileName=redactFileName,
                                      nonredactFileName=nonredactFileName,
                                      logFileName="ns_server.goxdcr.log")
        finally:
            """ clean up xdcr """
            rest_dest.delete_bucket()
            rest_src.remove_all_replications()
            rest_src.remove_all_remote_clusters()
            if self.interrupt_replication:
                shell = RemoteMachineShellConnection(self.master)
                shell.disable_firewall()
                shell.disconnect()
Example #18
    def test_cbcollect_with_redaction_enabled_with_xdcr(self):
        rest_src = RestConnection(self.master)
        rest_src.remove_all_replications()
        rest_src.remove_all_remote_clusters()

        rest_dest = RestConnection(self.servers[1])
        rest_dest_helper = RestHelper(rest_dest)

        try:
            rest_src.remove_all_replications()
            rest_src.remove_all_remote_clusters()
            self.set_redaction_level()
            rest_src.add_remote_cluster(self.servers[1].ip, self.servers[1].port,
                                        self.servers[1].rest_username,
                                        self.servers[1].rest_password, "C2")

            """ at dest cluster """
            self.add_built_in_server_user(node=self.servers[1])
            rest_dest.create_bucket(bucket='default', ramQuotaMB=512)
            bucket_ready = rest_dest_helper.vbucket_map_ready('default')
            if not bucket_ready:
                self.fail("Bucket default at dest not created after 120 seconds.")
            repl_id = rest_src.start_replication('continuous', 'default', "C2")
            if repl_id is not None:
                self.log.info("Replication created successfully")
            gen = BlobGenerator("ent-backup", "ent-backup-", self.value_size, end=self.num_items)
            tasks = self._async_load_all_buckets(self.master, gen, "create", 0)
            for task in tasks:
                task.result()
            self.sleep(10)

            """ enable firewall """
            if self.interrupt_replication:
                RemoteUtilHelper.enable_firewall(self.master, xdcr=True)

            """ start collect logs """
            self.start_logs_collection()
            result = self.monitor_logs_collection()
            """ verify logs """
            try:
                logs_path = result["perNode"]["ns_1@" + str(self.master.ip)]["path"]
            except KeyError:
                logs_path = result["perNode"]["[email protected]"]["path"]
            redactFileName = logs_path.split('/')[-1]
            nonredactFileName = logs_path.split('/')[-1].replace('-redacted', '')
            remotepath = logs_path[0:logs_path.rfind('/')+1]
            self.verify_log_files_exist(remotepath=remotepath,
                                        redactFileName=redactFileName,
                                        nonredactFileName=nonredactFileName)
            self.log.info("Verify on log ns_server.goxdcr.log")
            self.verify_log_redaction(remotepath=remotepath,
                                      redactFileName=redactFileName,
                                      nonredactFileName=nonredactFileName,
                                      logFileName="ns_server.goxdcr.log")
        finally:
            """ clean up xdcr """
            rest_dest.delete_bucket()
            rest_src.remove_all_replications()
            rest_src.remove_all_remote_clusters()
            if self.interrupt_replication:
                shell = RemoteMachineShellConnection(self.master)
                shell.disable_firewall()
                shell.disconnect()
Example #19
    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable=True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable=True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over,""" Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover))
Example #20
    def common_test_body(self, keys_count, replica, load_ratio, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replica : {0}".format(replica))
        log.info("load_ratio : {0}".format(load_ratio))
        log.info("failover_reason : {0}".format(failover_reason))
        master = self._servers[0]
        log.info('picking server : {0} as the master'.format(master))
        rest = RestConnection(master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=master.rest_username,
                          password=master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        bucket_ram = info.memoryQuota * 2 / 3
        bucket = 'default'
        rest.create_bucket(bucket=bucket,
                           ramQuotaMB=bucket_ram,
                           replicaNumber=replica,
                           proxyPort=info.moxi)
        ready = BucketOperationHelper.wait_for_memcached(master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        credentials = self._input.membase_settings

        ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers, credentials, self)
        nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
        msg = "rebalance failed after adding these nodes {0}".format(nodes)
        self.assertTrue(rest.monitorRebalance(), msg=msg)

        inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count, load_ratio)
        inserted_count = len(inserted_keys)
        log.info('inserted {0} keys'.format(inserted_count))

        nodes = rest.node_statuses()
        while (len(nodes) - replica) > 1:
            final_replication_state = RestHelper(rest).wait_for_replication(900)
            msg = "replication state after waiting for up to 15 minutes : {0}"
            self.log.info(msg.format(final_replication_state))
            chosen = RebalanceHelper.pick_nodes(master, howmany=replica)
            for node in chosen:
                #let's do op
                if failover_reason == 'stop_server':
                    self.stop_server(node)
                    log.info("10 seconds delay to wait for membase-server to shutdown")
                    #wait for 5 minutes until node is down
                    self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
                elif failover_reason == "firewall":
                    RemoteUtilHelper.enable_firewall(self._servers, node, bidirectional=self.bidirectional)
                    self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")

                failed_over = rest.fail_over(node.id)
                if not failed_over:
                    self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                    #try again in 60 seconds
                    time.sleep(75)
                    failed_over = rest.fail_over(node.id)
                self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
                log.info("failed over node : {0}".format(node.id))
            #REMOVEME -
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(), msg=msg)
            FailoverBaseTest.replication_verification(master, bucket, replica, inserted_count, self)

            nodes = rest.node_statuses()
        FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
Example #21
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in 75 seconds..")
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
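            # if the password or REST port is changed while the rebalance runs,
            # the RestConnection is rebuilt so later calls still authenticate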
            if self.during_ops:
                self.sleep(5, "Wait for some progress in rebalance")
                if self.during_ops == "change_password":
                    old_pass = self.master.rest_password
                    self.change_password(new_password=self.input.param("new_password", "new_pass"))
                    rest = RestConnection(self.master)
                elif self.during_ops == "change_port":
                    self.change_port(new_port=self.input.param("new_port", "9090"))
                    rest = RestConnection(self.master)
            try:
                msg = "rebalance failed while removing failover nodes {0}".format(chosen)
                self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
                for failed in chosen:
                    for server in _servers_:
                        if server.ip == failed.ip:
                            _servers_.remove(server)
                            self._cleanup_nodes.append(server)

                log.info("Begin VERIFICATION ...")
                RebalanceHelper.wait_for_replication(_servers_, self.cluster)
                self.verify_cluster_stats(_servers_, self.master)
            finally:
                if self.during_ops:
                    if self.during_ops == "change_password":
                        self.change_password(new_password=old_pass)
                    elif self.during_ops == "change_port":
                        self.change_port(new_port='8091',
                                         current_port=self.input.param("new_port", "9090"))
示例#22
0
 def test_node_memcached_failure_in_series(self):
     timeout = self.timeout / 2
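     # enable auto-reprovision; the second argument is presumably the maximum
     # number of nodes that may be auto-reprovisioned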
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     data_lost = False
     for i in reversed(xrange(len(self.servers))):
         print self.servers[i]
         operation = random.choice(
             ['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
         shell = RemoteMachineShellConnection(self.servers[i])
         print "operation", operation
         if i == 0:
             self.master = self.servers[1]
         if operation == 'stop':
             self._stop_couchbase(self.servers[i])
         elif operation == 'memcached_failure':
             self._pause_couchbase(self.servers[i])
         elif operation == 'restart':
             shell.restart_couchbase()
         elif operation == 'failover':
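             # firewalling the node off makes it unreachable, so the cluster
             # marks it as failed (the 'failover' case of this test)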
             RemoteUtilHelper.enable_firewall(self.servers[i])
         elif operation == 'reboot':
             if shell.extract_remote_info().type.lower() == 'windows':
                 o, r = shell.execute_command("shutdown -r -f -t 0")
                 self.sleep(200)
             elif shell.extract_remote_info().type.lower() == 'linux':
                 o, r = shell.execute_command("reboot")
             shell.log_command_output(o, r)
             self.sleep(60)
         self.sleep(40)
         if operation == 'memcached_failure':
             AutoReprovisionBaseTest.wait_for_warmup_or_assert(
                 self.master, 1,
                 timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                 self)
         if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
             AutoReprovisionBaseTest.wait_for_failover_or_assert(
                 self.master, 1,
                 timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                 self)
         if operation != 'restart':
             RemoteUtilHelper.common_basic_setup([self.servers[i]])
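         # after bringing the node back, the test expects the failed-over
         # node count to drop back to 0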
         AutoReprovisionBaseTest.wait_for_failover_or_assert(
             self.master, 0,
             timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
         helper = RestHelper(RestConnection(self.master))
         self.assertTrue(helper.is_cluster_healthy(),
                         "cluster status is not healthy")
         self.sleep(40)
         if operation == 'memcached_failure' or operation == 'failover':
             self.assertTrue(helper.is_cluster_rebalanced(),
                             "cluster is not balanced")
         else:
             if 'kv' in self.servers[i].services and self.replicas > 0:
                 self.assertFalse(helper.is_cluster_rebalanced(),
                                  "cluster is balanced")
                 self.rest.rebalance(otpNodes=[
                     node.id for node in self.rest.node_statuses()
                 ],
                                     ejectedNodes=[])
                 self.assertTrue(self.rest.monitorRebalance())
             else:
                 self.assertTrue(helper.is_cluster_rebalanced(),
                                 "cluster is not balanced")
         buckets = self.rest.get_buckets()
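         # with zero replicas, a restart or reboot loses data, so the per-bucket
         # verification below is skipped from that point on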
         if self.replicas == 0 and (operation == 'restart'
                                    or operation == 'reboot'):
             data_lost = True
         for bucket in buckets:
             if not data_lost:
                 self.verify_loaded_data(self.master, bucket.name,
                                         self.loaded_items[bucket.name])
示例#23
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for entry in rest.get_logs():
                        self.log.error(entry)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in 75 seconds..")
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)
示例#24
0
    def common_test_body(self, keys_count, replica, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(replica))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master,
                               self.gen_create,
                               "create",
                               0,
                               batch_size=10000,
                               pause_secs=5,
                               timeout_secs=180)
        self._wait_for_stats_all_buckets(self._servers)

        _servers_ = self._servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        self._wait_for_replication(self._servers, timeout=600)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
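                # note: this example passes the full server list plus the target
                # node, unlike the single-server form used in the other examples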
                RemoteUtilHelper.enable_firewall(
                    self._servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self._servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command(
                                "/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    self.assertTrue(
                        status,
                        msg=
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info(
                    "unable to failover the node the first time. try again in 75 seconds.."
                )
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(
                failed_over,
                "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        self._wait_for_stats_all_buckets(_servers_)
        self._wait_for_replication(self._servers, timeout=600)
        self._verify_stats_all_buckets(_servers_)
        self._verify_all_buckets(self.master)
示例#25
0
 def enable_firewall(server):
     """Enable the firewall on the given server.

     @param server: server object on which to enable the firewall
     """
     RemoteUtilHelper.enable_firewall(server)
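
A minimal usage sketch for a thin wrapper like this, following the failover pattern of the
common_test_body examples above; the `rest` connection, `node` object, `server` and the
RestHelper wait are assumptions taken from those examples, not part of this snippet:

 # hypothetical sketch: firewall a node off, wait until the cluster reports it
 # as unhealthy, then fail it over
 enable_firewall(server)
 if RestHelper(rest).wait_for_node_status(node, "unhealthy", 300):
     rest.fail_over(node.id)
 else:
     raise AssertionError("node did not become unhealthy after enabling the firewall")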
示例#26
0
 def start_firewall_on_node(self, node):
     """ Method to start the firewall on a node which is subject to failover """
     for server in self.cluster.servers:
         if server.ip == node.ip:
             RemoteUtilHelper.enable_firewall(server)
示例#27
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
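                # block traffic to (and, when bidirectional, from) the node so
                # the cluster sees it as unhealthy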
                RemoteUtilHelper.enable_firewall(self.servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for entry in rest.get_logs():
                        self.log.error(entry)
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in 75 seconds..")
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("30 seconds sleep after failover before invoking rebalance...")
            time.sleep(30)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)