class AutoRetryFailedRebalance(RebalanceBaseTest):
    """Tests for the cluster's automatic retry of a failed rebalance.

    Each test induces a failure before or during a rebalance, then verifies
    that the pending auto-retry either succeeds, is cancelled, or is never
    scheduled, depending on the scenario.  Behavior is driven by test input
    params: ``enabled``, ``afterTimePeriod``, ``maxAttempts``,
    ``rebalance_operation`` and the failure-injection params per test.
    """

    def setUp(self):
        """Configure retry-rebalance settings and auto-failover from params."""
        super(AutoRetryFailedRebalance, self).setUp()
        self.rest = RestConnection(self.servers[0])
        self.sleep_time = self.input.param("sleep_time", 15)
        # Retry-rebalance knobs pushed to the cluster below.
        self.enabled = self.input.param("enabled", True)
        self.afterTimePeriod = self.input.param("afterTimePeriod", 300)
        self.maxAttempts = self.input.param("maxAttempts", 1)
        self.log.info("Changing the retry rebalance settings ....")
        self.change_retry_rebalance_settings(
            enabled=self.enabled,
            afterTimePeriod=self.afterTimePeriod,
            maxAttempts=self.maxAttempts)
        self.rebalance_operation = self.input.param("rebalance_operation",
                                                    "rebalance_out")
        self.disable_auto_failover = self.input.param("disable_auto_failover",
                                                      True)
        self.auto_failover_timeout = self.input.param("auto_failover_timeout",
                                                      120)
        # Auto-failover would race with the induced failures, so most tests
        # disable it; the autofailover-specific test enables it with the
        # configured timeout instead.
        if self.disable_auto_failover:
            self.rest.update_autofailover_settings(False, 120)
        else:
            self.rest.update_autofailover_settings(True,
                                                   self.auto_failover_timeout)

    def tearDown(self):
        """Restore retry-rebalance defaults and delete any extra server groups."""
        self.reset_retry_rebalance_settings()
        # Reset to default value
        super(AutoRetryFailedRebalance, self).tearDown()
        rest = RestConnection(self.servers[0])
        zones = rest.get_zone_names()
        # Remove zones created by tests; "Group 1" is the default group.
        for zone in zones:
            if zone != "Group 1":
                rest.delete_zone(zone)

    def test_auto_retry_of_failed_rebalance_where_failure_happens_before_rebalance(
            self):
        """Induce a node failure BEFORE rebalance; expect auto-retry to succeed
        once the failure is recovered."""
        before_rebalance_failure = self.input.param("before_rebalance_failure",
                                                    "stop_server")
        # induce the failure before the rebalance starts
        self._induce_error(before_rebalance_failure)
        self.sleep(self.sleep_time)
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            operation.result()
        except Exception as e:
            # Expected path: rebalance must fail because of the induced error.
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Recover from the error
            self._recover_from_error(before_rebalance_failure)
            self.check_retry_rebalance_succeeded()
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            # Best-effort cleanup regardless of outcome.
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.start_server(self.servers[1])
            self.stop_firewall_on_node(self.servers[1])

    def test_auto_retry_of_failed_rebalance_where_failure_happens_during_rebalance(
            self):
        """Induce a node failure DURING rebalance; expect auto-retry to succeed
        once the failure is recovered."""
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # induce the failure during the rebalance
            self._induce_error(during_rebalance_failure)
            operation.result()
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Recover from the error
            self._recover_from_error(during_rebalance_failure)
            self.check_retry_rebalance_succeeded()
        else:
            # This is added as the failover task is not throwing exception
            if self.rebalance_operation == "graceful_failover":
                # Recover from the error
                self._recover_from_error(during_rebalance_failure)
                self.check_retry_rebalance_succeeded()
            else:
                self.fail(
                    "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
                )
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.start_server(self.servers[1])
            self.stop_firewall_on_node(self.servers[1])

    def test_auto_retry_of_failed_rebalance_does_not_get_triggered_when_rebalance_is_stopped(
            self):
        """A user-stopped rebalance must NOT schedule an auto-retry."""
        # NOTE(review): the async task handle is deliberately not awaited here;
        # the rebalance is stopped via REST below.
        operation = self._rebalance_operation(self.rebalance_operation)
        reached = RestHelper(self.rest).rebalance_reached(30)
        self.assertTrue(reached,
                        "Rebalance failed or did not reach {0}%".format(30))
        self.rest.stop_rebalance(wait_timeout=self.sleep_time)
        result = json.loads(self.rest.get_pending_rebalance_info())
        self.log.info(result)
        retry_rebalance = result["retry_rebalance"]
        # "not_pending" means no retry is scheduled, which is what we expect.
        if retry_rebalance != "not_pending":
            self.fail(
                "Auto-retry succeeded even when Rebalance was stopped by user")

    def test_negative_auto_retry_of_failed_rebalance_where_rebalance_will_be_cancelled(
            self):
        """After a failed rebalance schedules a retry, a follow-up user action
        (cancel / disable / manual retry) must cancel the pending auto-retry."""
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        post_failure_operation = self.input.param("post_failure_operation",
                                                  "cancel_pending_rebalance")
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # induce the failure during the rebalance
            self._induce_error(during_rebalance_failure)
            operation.result()
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Recover from the error
            self._recover_from_error(during_rebalance_failure)
            result = json.loads(self.rest.get_pending_rebalance_info())
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            rebalance_id = result["rebalance_id"]
            # A retry must first be pending before we can cancel it.
            if retry_rebalance != "pending":
                self.fail("Auto-retry of failed rebalance is not triggered")
            if post_failure_operation == "cancel_pending_rebalance":
                # cancel pending rebalance
                self.log.info(
                    "Cancelling rebalance Id: {0}".format(rebalance_id))
                self.rest.cancel_pending_rebalance(rebalance_id)
            elif post_failure_operation == "disable_auto_retry":
                # disable the auto retry of the failed rebalance
                self.log.info(
                    "Disable the the auto retry of the failed rebalance")
                self.change_retry_rebalance_settings(enabled=False)
            elif post_failure_operation == "retry_failed_rebalance_manually":
                # retry failed rebalance manually
                self.log.info(
                    "Retrying failed rebalance Id: {0}".format(rebalance_id))
                self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
            else:
                self.fail("Invalid post_failure_operation option")
            # Now check and ensure retry won't happen
            result = json.loads(self.rest.get_pending_rebalance_info())
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            if retry_rebalance != "not_pending":
                self.fail("Auto-retry of failed rebalance is not cancelled")
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.start_server(self.servers[1])
            self.stop_firewall_on_node(self.servers[1])

    def test_negative_auto_retry_of_failed_rebalance_where_rebalance_will_not_be_cancelled(
            self):
        """Cluster-topology changes made while a retry is pending do not cancel
        it; the retry is attempted but is expected to exhaust its attempts."""
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        post_failure_operation = self.input.param("post_failure_operation",
                                                  "create_delete_buckets")
        # Randomized zone name, truncated to the 64-char-ish REST limit.
        zone_name = "Group_{0}_{1}".format(random.randint(1, 1000000000),
                                           self._testMethodName)
        zone_name = zone_name[0:60]
        default_zone = "Group 1"
        moved_node = []
        moved_node.append(self.servers[1].ip)
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # induce the failure during the rebalance
            self._induce_error(during_rebalance_failure)
            operation.result()
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Recover from the error
            self._recover_from_error(during_rebalance_failure)
            result = json.loads(self.rest.get_pending_rebalance_info())
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            if retry_rebalance != "pending":
                self.fail("Auto-retry of failed rebalance is not triggered")
            if post_failure_operation == "create_delete_buckets":
                # delete buckets and create new one
                BucketOperationHelper.delete_all_buckets_or_assert(
                    servers=self.servers, test_case=self)
                self.sleep(self.sleep_time)
                BucketOperationHelper.create_bucket(self.master,
                                                    test_case=self)
            elif post_failure_operation == "change_replica_count":
                # change replica count
                self.log.info("Changing replica count of buckets")
                for bucket in self.buckets:
                    self.rest.change_bucket_props(bucket, replicaNumber=2)
            elif post_failure_operation == "change_server_group":
                # change server group
                self.log.info("Creating new zone " + zone_name)
                self.rest.add_zone(zone_name)
                self.log.info("Moving {0} to new zone {1}".format(
                    moved_node, zone_name))
                # NOTE(review): 'status' is unused here; shuffle result is only
                # checked in the cleanup path below.
                status = self.rest.shuffle_nodes_in_zones(
                    moved_node, default_zone, zone_name)
            else:
                self.fail("Invalid post_failure_operation option")
            # In these failure scenarios while the retry is pending, then the retry will be attempted but fail
            try:
                self.check_retry_rebalance_succeeded()
            except Exception as e:
                self.log.info(e)
                if "Retrying of rebalance still did not help. All the retries exhausted" not in str(
                        e):
                    self.fail(
                        "Auto retry of failed rebalance succeeded when it was expected to fail"
                    )
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            # Undo the server-group shuffle before generic cleanup so the node
            # is back in the default group for subsequent tests.
            if post_failure_operation == "change_server_group":
                status = self.rest.shuffle_nodes_in_zones(
                    moved_node, zone_name, default_zone)
                self.log.info(
                    "Shuffle the node back to default group . Status : {0}".
                    format(status))
                self.sleep(self.sleep_time)
                self.log.info("Deleting new zone " + zone_name)
                try:
                    self.rest.delete_zone(zone_name)
                except:
                    self.log.info("Errors in deleting zone")
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.start_server(self.servers[1])
            self.stop_firewall_on_node(self.servers[1])

    def test_auto_retry_of_failed_rebalance_with_rebalance_test_conditions(
            self):
        """Fail the rebalance via an ns_server test condition; once the
        condition is deleted, the auto-retry should succeed."""
        test_failure_condition = self.input.param("test_failure_condition")
        # induce the failure before the rebalance starts
        self._induce_rebalance_test_condition(test_failure_condition)
        self.sleep(self.sleep_time)
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            operation.result()
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Delete the rebalance test condition so that we recover from the error
            self._delete_rebalance_test_condition(test_failure_condition)
            self.check_retry_rebalance_succeeded()
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            # Deleting a non-existent condition is harmless; ensures cleanup.
            self._delete_rebalance_test_condition(test_failure_condition)

    def test_auto_retry_of_failed_rebalance_with_autofailvoer_enabled(self):
        """With auto-failover enabled, a failover firing before the retry
        window should cancel the pending retry; otherwise the retry runs
        (and is expected to fail without the node)."""
        before_rebalance_failure = self.input.param("before_rebalance_failure",
                                                    "stop_server")
        # induce the failure before the rebalance starts
        self._induce_error(before_rebalance_failure)
        try:
            operation = self._rebalance_operation(self.rebalance_operation)
            operation.result()
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            if self.auto_failover_timeout < self.afterTimePeriod:
                # Failover fires first: it must cancel the pending retry.
                self.sleep(self.auto_failover_timeout)
                result = json.loads(self.rest.get_pending_rebalance_info())
                self.log.info(result)
                retry_rebalance = result["retry_rebalance"]
                if retry_rebalance != "not_pending":
                    self.fail(
                        "Auto-failover did not cancel pending retry of the failed rebalance"
                    )
            else:
                # Retry fires first: it must fail (node still down), then a
                # manual rebalance after failover should clean up.
                try:
                    self.check_retry_rebalance_succeeded()
                except Exception as e:
                    if "Retrying of rebalance still did not help" not in str(
                            e):
                        self.fail(
                            "retry rebalance succeeded even without failover")
                    self.sleep(self.auto_failover_timeout)
                    self.cluster.rebalance(self.servers[:self.nodes_init], [],
                                           [])
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.start_server(self.servers[1])
            self.stop_firewall_on_node(self.servers[1])

    def _rebalance_operation(self, rebalance_operation):
        """Start the requested rebalance variant asynchronously and return
        the task handle.

        Variants: rebalance_out, rebalance_in, swap_rebalance (add node
        nodes_init, remove node nodes_init-1), graceful_failover.
        """
        self.log.info("Starting rebalance operation of type : {0}".format(
            rebalance_operation))
        if rebalance_operation == "rebalance_out":
            operation = self.cluster.async_rebalance(
                self.servers[:self.nodes_init], [], self.servers[1:])
        elif rebalance_operation == "rebalance_in":
            operation = self.cluster.async_rebalance(
                self.servers[:self.nodes_init],
                [self.servers[self.nodes_init]], [])
        elif rebalance_operation == "swap_rebalance":
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password,
                               self.servers[self.nodes_init].ip,
                               self.servers[self.nodes_init].port)
            operation = self.cluster.async_rebalance(
                self.servers[:self.nodes_init], [],
                [self.servers[self.nodes_init - 1]])
        elif rebalance_operation == "graceful_failover":
            # TODO : retry for graceful failover is not yet implemented
            operation = self.cluster.async_failover(
                [self.master],
                failover_nodes=[self.servers[1]],
                graceful=True,
                wait_for_pending=120)
        return operation

    def _induce_error(self, error_condition):
        """Break node servers[1] using the named failure mechanism."""
        if error_condition == "stop_server":
            self.stop_server(self.servers[1])
        elif error_condition == "enable_firewall":
            self.start_firewall_on_node(self.servers[1])
        elif error_condition == "kill_memcached":
            self.kill_server_memcached(self.servers[1])
        elif error_condition == "reboot_server":
            shell = RemoteMachineShellConnection(self.servers[1])
            shell.reboot_node()
        elif error_condition == "kill_erlang":
            shell = RemoteMachineShellConnection(self.servers[1])
            shell.kill_erlang()
            # Give ns_server time to actually go down before proceeding.
            self.sleep(self.sleep_time * 3)
        else:
            self.fail("Invalid error induce option")

    def _recover_from_error(self, error_condition):
        """Undo the failure injected by _induce_error and wait for recovery.

        NOTE(review): 'kill_memcached' has no explicit recovery branch here —
        presumably memcached restarts on its own; confirm against the infra.
        """
        if error_condition == "stop_server" or error_condition == "kill_erlang":
            self.start_server(self.servers[1])
            self.sleep(self.sleep_time * 4)
        elif error_condition == "enable_firewall":
            self.stop_firewall_on_node(self.servers[1])
        elif error_condition == "reboot_server":
            self.sleep(self.sleep_time * 4)
            # wait till node is ready after warmup
            ClusterOperationHelper.wait_for_ns_servers_or_assert(
                [self.servers[1]], self, wait_if_warmup=True)

    def _induce_rebalance_test_condition(self, test_failure_condition):
        """Set an ns_server 'testconditions' failure hook on every node via
        diag/eval, then read it back for logging."""
        if test_failure_condition == "verify_replication":
            set_command = "testconditions:set(verify_replication, {fail, \"" + "default" + "\"})"
        elif test_failure_condition == "backfill_done":
            set_command = "testconditions:set(backfill_done, {for_vb_move, \"" + "default\", 1 , " + "fail})"
        else:
            set_command = "testconditions:set({0}, fail)".format(
                test_failure_condition)
        get_command = "testconditions:get({0})".format(test_failure_condition)
        for server in self.servers:
            rest = RestConnection(server)
            _, content = rest.diag_eval(set_command)
            self.log.info("Command : {0} Return : {1}".format(
                set_command, content))

        for server in self.servers:
            rest = RestConnection(server)
            _, content = rest.diag_eval(get_command)
            self.log.info("Command : {0} Return : {1}".format(
                get_command, content))

    def _delete_rebalance_test_condition(self, test_failure_condition):
        """Delete the ns_server 'testconditions' hook on every node, then read
        it back for logging."""
        delete_command = "testconditions:delete({0})".format(
            test_failure_condition)
        get_command = "testconditions:get({0})".format(test_failure_condition)
        for server in self.servers:
            rest = RestConnection(server)
            _, content = rest.diag_eval(delete_command)
            self.log.info("Command : {0} Return : {1}".format(
                delete_command, content))

        for server in self.servers:
            rest = RestConnection(server)
            _, content = rest.diag_eval(get_command)
            self.log.info("Command : {0} Return : {1}".format(
                get_command, content))
# ---------------------------------------------------------------------------
# Example 2: newer-framework variant of the same test class.
# ---------------------------------------------------------------------------
class AutoRetryFailedRebalance(RebalanceBaseTest):
    def setUp(self):
        """Configure retry-rebalance settings, auto-failover, and data-load
        options from test params (newer-framework variant)."""
        super(AutoRetryFailedRebalance, self).setUp()
        self.rest = RestConnection(self.cluster.master)
        self.sleep_time = self.input.param("sleep_time", 15)
        # Retry-rebalance knobs pushed to the cluster below.
        self.enabled = self.input.param("enabled", True)
        self.afterTimePeriod = self.input.param("afterTimePeriod", 300)
        self.maxAttempts = self.input.param("maxAttempts", 1)
        self.log.info("Changing the retry rebalance settings ....")
        self.change_retry_rebalance_settings(
            enabled=self.enabled,
            afterTimePeriod=self.afterTimePeriod,
            maxAttempts=self.maxAttempts)
        self.rebalance_operation = self.input.param("rebalance_operation",
                                                    "rebalance_out")
        self.disable_auto_failover = self.input.param("disable_auto_failover",
                                                      True)
        self.auto_failover_timeout = self.input.param("auto_failover_timeout",
                                                      120)
        # Auto-failover would race with the induced failures, so most tests
        # disable it.
        if self.disable_auto_failover:
            self.rest.update_autofailover_settings(False, 120)
        else:
            self.rest.update_autofailover_settings(True,
                                                   self.auto_failover_timeout)
        # ip -> reason map of nodes expected to fail cbcollect.
        self.cb_collect_failure_nodes = dict()
        # To support data load during auto retry op
        self.data_load = self.input.param("data_load", False)
        self.rebalance_failed_msg = "Rebalance failed as expected"

    def tearDown(self):
        """Restore retry-rebalance defaults, collect logs, and remove any
        server groups created by the test (the default "Group 1" is kept)."""
        self.reset_retry_rebalance_settings()
        self.cbcollect_info()
        # Reset to default value
        super(AutoRetryFailedRebalance, self).tearDown()
        rest_conn = RestConnection(self.servers[0])
        for group in rest_conn.get_zone_names():
            if group == "Group 1":
                continue
            rest_conn.delete_zone(group)

    def __update_cbcollect_expected_node_failures(self, nodes, reason):
        """Record *reason* against each given node's IP in the map of nodes
        expected to fail cbcollect."""
        self.cb_collect_failure_nodes.update(
            {node.ip: reason for node in nodes})

    def set_retry_exceptions(self, doc_loading_spec):
        """Attach the standard set of retryable SDK exceptions to the given
        doc-loading spec; durability exceptions are added only when a
        durability level is configured."""
        exceptions_to_retry = [
            SDKException.AmbiguousTimeoutException,
            SDKException.TimeoutException,
            SDKException.RequestCanceledException,
            SDKException.DocumentNotFoundException,
            SDKException.ServerOutOfMemoryException,
        ]
        if self.durability_level:
            exceptions_to_retry += [
                SDKException.DurabilityAmbiguousException,
                SDKException.DurabilityImpossibleException,
            ]
        doc_loading_spec[MetaCrudParams.RETRY_EXCEPTIONS] = exceptions_to_retry

    def async_data_load(self):
        """Kick off an asynchronous doc-load scenario ("volume_test_load"
        template) across all buckets and return the load task(s)."""
        doc_loading_spec = self.bucket_util.get_crud_template_from_package(
            "volume_test_load")
        self.set_retry_exceptions(doc_loading_spec)
        tasks = self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_loading_spec,
            mutation_num=0,
            async_load=True,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency)
        return tasks

    def data_validation(self, tasks):
        """Wait for the async doc-load task(s) to finish, fail the test on a
        load error, then validate doc counts across all buckets."""
        self.task.jython_task_manager.get_task_result(tasks)
        self.bucket_util.validate_doc_loading_results(tasks)
        if tasks.result is False:
            self.fail("Doc_loading failed")

        self.cluster_util.print_cluster_stats(self.cluster)
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets,
                                                     timeout=1200)
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
        self.bucket_util.print_bucket_stats(self.cluster)

    def test_auto_retry_of_failed_rebalance_where_failure_happens_before_rebalance(
            self):
        """Induce a node failure BEFORE rebalance; expect the rebalance task to
        fail, then verify auto-retry succeeds after recovery (optionally with
        a concurrent data load and cbcollect validation)."""
        tasks = None
        before_rebalance_failure = self.input.param("before_rebalance_failure",
                                                    "stop_server")
        # induce the failure before the rebalance starts
        self._induce_error(before_rebalance_failure)
        self.sleep(self.sleep_time)

        try:
            rebalance = self._rebalance_operation(self.rebalance_operation)
            self.task.jython_task_manager.get_task_result(rebalance)
            # assertTrue raises into the except branch when the task failed,
            # which is the expected path for this test.
            self.assertTrue(rebalance.result, self.rebalance_failed_msg)
        except Exception as e:
            self.log.info("Rebalance failed with: {0}".format(str(e)))
            # Trigger cbcollect after rebalance failure
            self.cbcollect_info(trigger=True,
                                validate=False,
                                known_failures=self.cb_collect_failure_nodes)
            # Recover from the error
            self._recover_from_error(before_rebalance_failure)
            if self.data_load:
                tasks = self.async_data_load()
            self.check_retry_rebalance_succeeded()
            # Validate cbcollect result after rebalance retry
            self.cbcollect_info(trigger=False,
                                validate=True,
                                known_failures=self.cb_collect_failure_nodes)
            if self.data_load:
                self.data_validation(tasks)
        else:
            self.fail("Rebalance did not fail as expected. "
                      "Hence could not validate auto-retry feature..")
        finally:
            # Best-effort cleanup regardless of outcome.
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.cluster_util.start_server(self.cluster, self.servers[1])
            self.cluster_util.stop_firewall_on_node(self.cluster,
                                                    self.servers[1])

    def test_auto_retry_of_failed_rebalance_where_failure_happens_during_rebalance(
            self):
        """Induce a node failure DURING rebalance; expect auto-retry to succeed
        once the failure is recovered (optionally with concurrent data load)."""
        tasks = None
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        try:
            rebalance = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # induce the failure during the rebalance
            self._induce_error(during_rebalance_failure)
            self.task.jython_task_manager.get_task_result(rebalance)
            self.assertTrue(rebalance.result, self.rebalance_failed_msg)
        except Exception as e:
            self.log.info("Rebalance failed with: {0}".format(str(e)))
            # Recover from the error
            self._recover_from_error(during_rebalance_failure)
            if self.data_load:
                tasks = self.async_data_load()
            self.check_retry_rebalance_succeeded()
            if self.data_load:
                self.data_validation(tasks)
        else:
            # This is added as the failover task is not throwing exception
            if self.rebalance_operation == "graceful_failover":
                # Recover from the error
                self._recover_from_error(during_rebalance_failure)
                if self.data_load:
                    tasks = self.async_data_load()
                self.check_retry_rebalance_succeeded()
                if self.data_load:
                    self.data_validation(tasks)
            else:
                self.fail("Rebalance did not fail as expected. "
                          "Hence could not validate auto-retry feature..")
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.cluster_util.start_server(self.cluster, self.servers[1])
            self.cluster_util.stop_firewall_on_node(self.cluster,
                                                    self.servers[1])

    def test_auto_retry_of_failed_rebalance_does_not_get_triggered_when_rebalance_is_stopped(
            self):
        """A user-stopped rebalance must NOT schedule an auto-retry."""
        # Task handle deliberately discarded; the rebalance is stopped via
        # REST once it has made enough progress.
        _ = self._rebalance_operation(self.rebalance_operation)
        reached = self.cluster_util.rebalance_reached(self.rest, 30)
        self.assertTrue(reached, "Rebalance failed or did not reach 30%")
        # Trigger cbcollect before interrupting the rebalance
        self.cbcollect_info(trigger=True, validate=False)
        self.rest.stop_rebalance(wait_timeout=self.sleep_time)
        result = json.loads(self.rest.get_pending_rebalance_info())
        self.log.info(result)
        # Validate cbcollect results
        self.cbcollect_info(trigger=False, validate=True)
        retry_rebalance = result["retry_rebalance"]
        # "not_pending" means no retry was scheduled — the expected outcome.
        if retry_rebalance != "not_pending":
            self.fail(
                "Auto-retry succeeded even when Rebalance was stopped by user")

    def test_negative_auto_retry_of_failed_rebalance_where_rebalance_will_be_cancelled(
            self):
        """After a failed rebalance schedules a retry, a follow-up user action
        (cancel / disable / manual retry) must cancel the pending auto-retry."""
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        post_failure_operation = self.input.param("post_failure_operation",
                                                  "cancel_pending_rebalance")
        try:
            rebalance = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # induce the failure during the rebalance
            self._induce_error(during_rebalance_failure)
            self.task.jython_task_manager.get_task_result(rebalance)
            self.assertTrue(rebalance.result, self.rebalance_failed_msg)
        except Exception as e:
            self.log.info("Rebalance failed with: %s" % e)
            # Recover from the error
            self._recover_from_error(during_rebalance_failure)
            # TODO : Data load at this stage fails;
            # if self.data_load:
            #     tasks = self.async_data_load()
            result = json.loads(self.rest.get_pending_rebalance_info())
            # if self.data_load:
            #     self.data_validation(tasks)
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            rebalance_id = result["rebalance_id"]
            # A retry must first be pending before we can cancel it.
            if retry_rebalance != "pending":
                self.fail("Auto-retry of failed rebalance is not triggered")
            if post_failure_operation == "cancel_pending_rebalance":
                # cancel pending rebalance
                self.log.info("Cancelling rebalance-id: %s" % rebalance_id)
                self.rest.cancel_pending_rebalance(rebalance_id)
            elif post_failure_operation == "disable_auto_retry":
                # disable the auto retry of the failed rebalance
                self.log.info("Disable the auto retry of the failed rebalance")
                self.change_retry_rebalance_settings(enabled=False)
            elif post_failure_operation == "retry_failed_rebalance_manually":
                # retry failed rebalance manually
                self.log.info("Retrying failed rebalance id %s" % rebalance_id)
                self.cluster.rebalance(self.servers[:self.nodes_init], [], [])
            else:
                self.fail("Invalid post_failure_operation option")
            # Now check and ensure retry won't happen
            result = json.loads(self.rest.get_pending_rebalance_info())
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            if retry_rebalance != "not_pending":
                self.fail("Auto-retry of failed rebalance is not cancelled")
        else:
            self.fail("Rebalance did not fail as expected. "
                      "Hence could not validate auto-retry feature..")
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.cluster_util.start_server(self.cluster, self.servers[1])
            self.cluster_util.stop_firewall_on_node(self.cluster,
                                                    self.servers[1])

    def test_negative_auto_retry_of_failed_rebalance_where_rebalance_will_not_be_cancelled(
            self):
        """Negative scenario: after a rebalance fails mid-flight and an
        auto-retry becomes pending, perform a cluster change (replica count
        or server group) and verify the retry is still attempted but ends
        with all retries exhausted.

        Fails the test if:
        - the initial rebalance does not fail as expected,
        - no retry becomes pending after recovery,
        - the auto-retry unexpectedly succeeds.
        """
        during_rebalance_failure = self.input.param("during_rebalance_failure",
                                                    "stop_server")
        post_failure_operation = self.input.param("post_failure_operation",
                                                  "create_delete_buckets")
        # Server-group names have a length limit, so truncate to 60 chars
        zone_name = "Group_{0}_{1}".format(random.randint(1, 1000000000),
                                           self._testMethodName)
        zone_name = zone_name[0:60]
        default_zone = "Group 1"
        moved_node = [self.servers[1].ip]
        try:
            rebalance = self._rebalance_operation(self.rebalance_operation)
            self.sleep(self.sleep_time)
            # Induce the failure while the rebalance is in flight
            self._induce_error(during_rebalance_failure)
            self.task.jython_task_manager.get_task_result(rebalance)
            self.assertTrue(rebalance.result, self.rebalance_failed_msg)
        except Exception as e:
            self.log.info("Rebalance failed with : {0}".format(str(e)))
            # Recover from the error so the pending retry can proceed
            self._recover_from_error(during_rebalance_failure)
            result = json.loads(self.rest.get_pending_rebalance_info())
            self.log.info(result)
            retry_rebalance = result["retry_rebalance"]
            if retry_rebalance != "pending":
                self.fail("Auto-retry of failed rebalance is not triggered")

            # Start cbcollect only if auto-retry of rebalance is triggered
            self.cbcollect_info(trigger=True, validate=False)
            if post_failure_operation == "change_replica_count":
                # change replica count
                self.log.info("Changing replica count of buckets")
                for bucket in self.cluster.buckets:
                    self.bucket_util.update_bucket_property(
                        self.cluster.master, bucket, replica_number=2)
            elif post_failure_operation == "change_server_group":
                # change server group
                self.log.info("Creating new zone " + zone_name)
                self.rest.add_zone(zone_name)
                self.log.info("Moving {0} to new zone {1}".format(
                    moved_node, zone_name))
                _ = self.rest.shuffle_nodes_in_zones(moved_node, default_zone,
                                                     zone_name)
            else:
                self.fail("Invalid post_failure_operation option")
            # In these failure scenarios while the retry is pending,
            # then the retry will be attempted but fail
            try:
                self.check_retry_rebalance_succeeded()
                # Validate cbcollect results
                self.cbcollect_info(trigger=False, validate=True)
            except Exception as retry_err:
                # Distinct name so the outer exception 'e' is not shadowed
                self.log.info(retry_err)
                # Wait for cbcollect to complete before asserting
                self.cbcollect_info(trigger=False, validate=True)
                if "Retrying of rebalance still did not help. All the retries exhausted" not in str(
                        retry_err):
                    self.fail(
                        "Auto retry of failed rebalance succeeded when it was expected to fail"
                    )
        else:
            self.fail(
                "Rebalance did not fail as expected. Hence could not validate auto-retry feature.."
            )
        finally:
            if post_failure_operation == "change_server_group":
                status = self.rest.shuffle_nodes_in_zones(
                    moved_node, zone_name, default_zone)
                self.log.info("Shuffle the node back to default group. "
                              "Status: %s" % status)
                self.sleep(self.sleep_time)
                self.log.info("Deleting new zone " + zone_name)
                try:
                    self.rest.delete_zone(zone_name)
                except Exception:
                    # Was a bare 'except:'; narrowed so SystemExit and
                    # KeyboardInterrupt are not swallowed
                    self.log.info("Errors in deleting zone")
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.cluster_util.start_server(self.cluster, self.servers[1])
            self.cluster_util.stop_firewall_on_node(self.cluster,
                                                    self.servers[1])

    def test_auto_retry_of_failed_rebalance_with_rebalance_test_conditions(
            self):
        """Force a rebalance failure through an ns_server test condition,
        clear the condition, then verify the auto-retry succeeds
        (optionally while data load is running).
        """
        load_tasks = None
        failure_condition = self.input.param("test_failure_condition")
        # Set the failure condition before the rebalance starts
        self._induce_rebalance_test_condition(failure_condition)
        self.sleep(self.sleep_time)
        try:
            rebalance_task = self._rebalance_operation(
                self.rebalance_operation)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, self.rebalance_failed_msg)
        except Exception as err:
            self.log.info("Rebalance failed with: %s" % err)
            # Remove the test condition so the auto-retry can succeed
            self._delete_rebalance_test_condition(failure_condition)
            if self.data_load:
                load_tasks = self.async_data_load()
            self.check_retry_rebalance_succeeded()
            if self.data_load:
                self.data_validation(load_tasks)
        else:
            self.fail("Rebalance did not fail as expected. "
                      "Hence could not validate auto-retry feature..")
        finally:
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self._delete_rebalance_test_condition(failure_condition)

    def test_auto_retry_of_failed_rebalance_with_autofailvoer_enabled(self):
        """Verify the interplay between rebalance auto-retry and auto-failover.

        A failure is induced before the rebalance starts, so the rebalance
        is expected to fail. Then:
        - If auto_failover_timeout < afterTimePeriod, auto-failover should
          fire first and cancel the pending retry of the failed rebalance.
        - Otherwise the auto-retry runs first and is expected to exhaust its
          attempts (the induced error persists); the test then waits out the
          failover timeout and rebalances the cluster manually.
        cbcollect is started after the failure and validated at the end.
        """
        # NOTE(review): "autofailvoer" is a typo for "autofailover" in the
        # method name; renaming would break existing test configs, left as-is.
        before_rebalance_failure = self.input.param("before_rebalance_failure",
                                                    "stop_server")
        # induce the failure before the rebalance starts
        self._induce_error(before_rebalance_failure)
        try:
            rebalance = self._rebalance_operation(self.rebalance_operation)
            self.task.jython_task_manager.get_task_result(rebalance)
            # Expected path: the rebalance fails, so this assert raises and
            # control moves to the except branch below
            self.assertTrue(rebalance.result, self.rebalance_failed_msg)
        except Exception as e:
            self.log.info("Rebalance failed with: {0}".format(str(e)))
            self.cbcollect_info(trigger=True,
                                validate=False,
                                known_failures=self.cb_collect_failure_nodes)
            if self.auto_failover_timeout < self.afterTimePeriod:
                # Auto-failover should fire before the retry period elapses
                # and cancel the pending rebalance retry
                self.sleep(self.auto_failover_timeout)
                result = json.loads(self.rest.get_pending_rebalance_info())
                self.log.info(result)
                retry_rebalance = result["retry_rebalance"]
                if retry_rebalance != "not_pending":
                    # Wait for cbcollect to complete before asserting
                    self.cbcollect_info(
                        trigger=False,
                        validate=True,
                        known_failures=self.cb_collect_failure_nodes)
                    self.fail("Auto-failover did not cancel pending retry "
                              "of the failed rebalance")
            else:
                # Retry period elapses first: the auto-retry is expected to
                # exhaust its attempts because the induced error persists
                try:
                    self.check_retry_rebalance_succeeded()
                except Exception as e:
                    # NOTE(review): this inner 'e' shadows the outer exception
                    expected_msg = "Retrying of rebalance still did not help"
                    if expected_msg not in str(e):
                        self.fail("Retry rebalance succeeded "
                                  "even without failover")
                    # Wait out the failover timeout, then rebalance manually
                    self.sleep(self.auto_failover_timeout)
                    self.cluster.rebalance(self.servers[:self.nodes_init], [],
                                           [])
                finally:
                    # Always validate the cbcollect started above
                    self.cbcollect_info(
                        trigger=False,
                        validate=True,
                        known_failures=self.cb_collect_failure_nodes)
        else:
            self.fail("Rebalance did not fail as expected. "
                      "Hence could not validate auto-retry feature..")
        finally:
            # Restore auto-failover settings and recover the failed node
            if self.disable_auto_failover:
                self.rest.update_autofailover_settings(True, 120)
            self.cluster_util.start_server(self.cluster, self.servers[1])
            self.cluster_util.stop_firewall_on_node(self.cluster,
                                                    self.servers[1])

    def test_cbcollect_with_rebalance_delay_condition(self):
        """Trigger and validate cbcollect while a rebalance is being held
        back by an ns_server delay test-condition; the rebalance itself is
        expected to complete successfully afterwards.
        """
        failure_condition = self.input.param("test_failure_condition")
        target_vb = self.input.param("target_vb")
        # delay_time param is in seconds; the test condition wants millis
        delay_ms = self.input.param("delay_time", 60) * 1000
        # Install the delay condition before the rebalance starts
        self._induce_rebalance_test_condition(failure_condition,
                                              vb_num=target_vb,
                                              delay_time=delay_ms)
        self.sleep(self.sleep_time,
                   "Wait for rebalance_test_condition to take effect")
        rebalance_task = self._rebalance_operation(self.rebalance_operation)
        # Start and validate cbcollect with rebalance delay
        self.cbcollect_info(trigger=True, validate=True)
        self.task.jython_task_manager.get_task_result(rebalance_task)
        if self.disable_auto_failover:
            self.rest.update_autofailover_settings(True, 120)
        self._delete_rebalance_test_condition(failure_condition)
        if rebalance_task.result is False:
            self.fail("Rebalance failed with test_condition: %s" %
                      failure_condition)

    def _rebalance_operation(self, rebalance_operation):
        """Start the requested cluster operation asynchronously.

        :param rebalance_operation: one of "rebalance_out", "rebalance_in",
            "swap_rebalance", "graceful_failover"
        :return: the async task for the started operation, or None when
            the operation type is not recognised
        """
        self.log.info("Starting rebalance operation of type: %s" %
                      rebalance_operation)
        if rebalance_operation == "rebalance_out":
            out_nodes = self.cluster.servers[1:]
            task = self.task.async_rebalance(
                self.cluster, [], out_nodes,
                retry_get_process_num=self.retry_get_process_num)
            self.__update_cbcollect_expected_node_failures(out_nodes,
                                                           "out_node")
            return task
        if rebalance_operation == "rebalance_in":
            new_node = self.cluster.servers[self.nodes_init]
            task = self.task.async_rebalance(
                self.cluster, [new_node], [],
                retry_get_process_num=self.retry_get_process_num)
            self.__update_cbcollect_expected_node_failures([new_node],
                                                           "in_node")
            return task
        if rebalance_operation == "swap_rebalance":
            # Add the spare node directly, then rebalance out the last
            # initialized node so the two swap places
            new_node = self.cluster.servers[self.nodes_init]
            out_node = self.cluster.servers[self.nodes_init - 1]
            self.rest.add_node(self.cluster.master.rest_username,
                               self.cluster.master.rest_password,
                               new_node.ip, new_node.port)
            task = self.task.async_rebalance(
                self.cluster, [], [out_node],
                retry_get_process_num=self.retry_get_process_num)
            self.__update_cbcollect_expected_node_failures([new_node],
                                                           "in_node")
            self.__update_cbcollect_expected_node_failures([out_node],
                                                           "out_node")
            return task
        if rebalance_operation == "graceful_failover":
            # TODO : retry for graceful failover is not yet implemented
            return self.task.async_failover(
                [self.cluster.master],
                failover_nodes=[self.cluster.servers[1]],
                graceful=True,
                wait_for_pending=300)
        return None

    def _induce_error(self, error_condition):
        """Inject the requested failure on servers[1].

        :param error_condition: one of "stop_server", "enable_firewall",
            "kill_memcached", "reboot_server", "kill_erlang"; anything else
            fails the test
        """
        node = self.servers[1]
        # All failure modes except a memcached kill are expected to make
        # cbcollect report "failed" on the affected node
        conditions_failing_cbcollect = {"stop_server", "enable_firewall",
                                        "reboot_server", "kill_erlang"}
        if error_condition == "stop_server":
            self.cluster_util.stop_server(self.cluster, node)
        elif error_condition == "enable_firewall":
            self.cluster_util.start_firewall_on_node(self.cluster, node)
        elif error_condition == "kill_memcached":
            self.cluster_util.kill_memcached(self.cluster, node=node)
        elif error_condition == "reboot_server":
            conn = RemoteMachineShellConnection(node)
            conn.reboot_node()
            conn.disconnect()
        elif error_condition == "kill_erlang":
            conn = RemoteMachineShellConnection(node)
            conn.kill_erlang()
            conn.disconnect()
            # Give the babysitter time to restart the killed process
            self.sleep(self.sleep_time * 3)
        else:
            self.fail("Invalid error induce option")

        if error_condition in conditions_failing_cbcollect:
            self.__update_cbcollect_expected_node_failures([node], "failed")

    def _recover_from_error(self, error_condition):
        """Undo the failure previously injected by _induce_error on
        servers[1].
        """
        node = self.servers[1]
        if error_condition in ("stop_server", "kill_erlang"):
            self.cluster_util.start_server(self.cluster, node)
        elif error_condition == "enable_firewall":
            self.cluster_util.stop_firewall_on_node(self.cluster, node)
        elif error_condition == "reboot_server":
            # Wait for the node to come back up after the reboot
            self.sleep(self.sleep_time * 4)
            self.cluster_util.stop_firewall_on_node(self.cluster, node)

    def _induce_rebalance_test_condition(self,
                                         test_failure_condition,
                                         bucket_name="default",
                                         vb_num=1,
                                         delay_time=60000):
        """Set an ns_server 'testconditions' flag on every node via
        diag/eval so the next rebalance fails or is delayed, then read the
        condition back from every node to confirm it was applied.

        :param test_failure_condition: "verify_replication",
            "backfill_done", "delay_rebalance_start",
            "delay_verify_replication", "delay_backfill_done"; any other
            value is set to a plain fail condition
        :param bucket_name: bucket targeted by the condition
        :param vb_num: vbucket number for the backfill_done conditions
        :param delay_time: delay in milliseconds for the delay_* conditions
        """
        if test_failure_condition == "verify_replication":
            set_command = 'testconditions:set(verify_replication, ' \
                          '{fail, "%s"})' % bucket_name
        elif test_failure_condition == "backfill_done":
            set_command = 'testconditions:set(backfill_done, ' \
                          '{for_vb_move, "%s", %s , fail})' \
                          % (bucket_name, vb_num)
        elif test_failure_condition == "delay_rebalance_start":
            set_command = 'testconditions:set(rebalance_start, {delay, %s}).' \
                          % delay_time
        elif test_failure_condition == "delay_verify_replication":
            set_command = 'testconditions:set(verify_replication, ' \
                          '{delay, "%s", %s})' % (bucket_name, delay_time)
        elif test_failure_condition == "delay_backfill_done":
            set_command = 'testconditions:set(backfill_done, ' \
                          '{for_vb_move, "%s", %s, {delay, %s}})' \
                          % (bucket_name, vb_num, delay_time)
        else:
            set_command = "testconditions:set(%s, fail)" \
                          % test_failure_condition
        get_command = "testconditions:get(%s)" % test_failure_condition
        for server in self.servers:
            rest = RestConnection(server)
            shell = RemoteMachineShellConnection(server)
            try:
                shell.enable_diag_eval_on_non_local_hosts()
                _, content = rest.diag_eval(set_command)
                self.log.debug("Set Command: %s. Return: %s" %
                               (set_command, content))
            finally:
                shell.disconnect()

        # Read the condition back from each node for visibility in the logs
        for server in self.servers:
            rest = RestConnection(server)
            shell = RemoteMachineShellConnection(server)
            try:
                shell.enable_diag_eval_on_non_local_hosts()
                _, content = rest.diag_eval(get_command)
                self.log.info("Command: %s, Return: %s"
                              % (get_command, content))
            finally:
                # Fix: these shells were never disconnected in the verify
                # loop, leaking one connection per node per call
                shell.disconnect()

    def _delete_rebalance_test_condition(self, test_failure_condition):
        """Remove a previously-set 'testconditions' flag from every node and
        read it back to confirm deletion.

        The 'delay_' prefix used by _induce_rebalance_test_condition is only
        a test-side marker, not part of the stored condition name, so it is
        stripped before building the diag/eval commands.
        """
        if test_failure_condition.startswith("delay_"):
            test_failure_condition = test_failure_condition[6:]

        delete_command = "testconditions:delete(%s)" % test_failure_condition
        get_command = "testconditions:get(%s)" % test_failure_condition
        for server in self.servers:
            rest = RestConnection(server)
            shell = RemoteMachineShellConnection(server)
            try:
                shell.enable_diag_eval_on_non_local_hosts()
                _, content = rest.diag_eval(delete_command)
                self.log.info("Command: %s, Return: %s" %
                              (delete_command, content))
            finally:
                # Fix: shells were never disconnected, leaking one
                # connection per node per call
                shell.disconnect()

        # Read back to confirm the condition is gone on every node
        for server in self.servers:
            rest = RestConnection(server)
            shell = RemoteMachineShellConnection(server)
            try:
                shell.enable_diag_eval_on_non_local_hosts()
                _, content = rest.diag_eval(get_command)
                self.log.info("Command: %s, Return: %s"
                              % (get_command, content))
            finally:
                shell.disconnect()