def rebalance_in_with_failover_full_addback_recovery(self):
    gen_update = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    tasks = []
    tasks += self._async_load_all_buckets(self.master, gen_update, "update", 0)
    for task in tasks:
        task.result()
    servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.rest = RestConnection(self.master)
    self.nodes = self.get_nodes(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
    # Mark node for full recovery
    if success_failed_over:
        self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full")
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], servs_in, [])
    rebalance.result()
    self._verify_stats_all_buckets(self.servers[:self.nodes_in + self.nodes_init], timeout=120)
    self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init],
                              check_ep_items_remaining=True)
    self.compare_failovers_logs(prev_failover_stats,
                                self.servers[:self.nodes_in + self.nodes_init], self.buckets)
    self.sleep(30)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          self.servers[:self.nodes_in + self.nodes_init],
                                          self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
def rebalance_in_out_with_failover(self):
    fail_over = self.input.param("fail_over", False)
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
    servs_in = self.servers[self.nodes_init:self.nodes_init + 1]
    servs_out = self.servers[self.nodes_init - 1:self.nodes_init]
    for task in tasks:
        task.result(self.wait_timeout * 20)
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1])
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    result_nodes = self.add_remove_servers(self.servers, self.servers[:self.nodes_init],
                                           [self.servers[self.nodes_init - 1], chosen[0]],
                                           [self.servers[self.nodes_init]])
    self.rest.add_node(self.master.rest_username, self.master.rest_password,
                       self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port)
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)
    self.nodes = self.rest.node_statuses()
    self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[chosen[0].id, ejectedNode.id])
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
    self._wait_for_stats_all_buckets(result_nodes)
    self.sleep(10)
    self.verify_cluster_stats(result_nodes)
    self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          result_nodes, self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def test_rebalance_in_out_with_failover_addback_recovery(self):
    """ Rebalances nodes out of and into the cluster while doing a failover and a
    full/delta-recovery add-back of a node.

    Use different nodes_in and nodes_out params to have uneven addition and removal.
    Use the 'zone' param to divide nodes into server groups by setting zone > 1.

    This test begins by loading a given number of items into the cluster. It then
    removes one node, rebalances it out of the cluster, and then rebalances it back
    in. During the rebalance we update all of the items in the cluster. Once the
    node has been removed and added back, we wait for the disk queues to drain and
    then verify that there has been no data loss: sum(curr_items) must match
    curr_items_total. We then remove and add back two nodes at a time, and so on,
    until we are adding back and removing at least half of the nodes.
    """
    recovery_type = self.input.param("recoveryType", "full")
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
    servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
    servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init]
    for task in tasks:
        task.result(self.wait_timeout * 20)
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    self.nodes = self.get_nodes(self.master)
    result_nodes = list(set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
    for node in servs_in:
        self.rest.add_node(self.master.rest_username, self.master.rest_password,
                           node.ip, node.port)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    # Mark node for failover
    self.sleep(30)
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
    # Mark node for full or delta recovery
    if success_failed_over:
        self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType=recovery_type)
    self.sleep(30)
    self.shuffle_nodes_between_zones_and_rebalance(servs_out)
    self._verify_stats_all_buckets(result_nodes, timeout=120)
    self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True)
    self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets)
    self.sleep(30)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          result_nodes, self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
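# A minimal sketch (not part of the original suite) of the failover/add-back
# sequence the test above exercises, using only RestConnection calls that appear
# elsewhere in this file. `rest`, `otp_id`, and `recovery` are assumed inputs;
# treat this as an illustration of the flow, not a definitive helper.
def _failover_and_addback_sketch(rest, otp_id, recovery="full"):
    # Hard failover first; graceful=True would drain the node instead.
    if rest.fail_over(otp_id, graceful=False):
        # "full" wipes the node and rebuilds it; "delta" catches it up from
        # the data it still holds.
        rest.set_recovery_type(otpNode=otp_id, recoveryType=recovery)
    # A rebalance with an empty eject list re-integrates the failed-over node.
    rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()], ejectedNodes=[])
    return rest.monitorRebalance(stop_if_loop=True)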
def test_start_stop_rebalance_after_failover(self):
    """ Rebalances nodes out of and into the cluster with failover.

    Use different nodes_in and nodes_out params to have uneven addition and removal.
    Use the 'zone' param to divide nodes into server groups by setting zone > 1.

    The test begins by loading the buckets with a given number of items. It then
    fails over a node and rebalances the cluster while adding or removing the given
    number of nodes. Once the rebalance reaches 50%, we stop it and validate the
    cluster stats. We then restart the rebalance and validate that it completes
    successfully.
    """
    fail_over = self.input.param("fail_over", False)
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
    for task in tasks:
        task.result(self.wait_timeout * 20)
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    result_nodes = list(set(self.servers[:self.nodes_init] + self.servs_in) - set(self.servs_out))
    for node in self.servs_in:
        self.rest.add_node(self.master.rest_username, self.master.rest_password,
                           node.ip, node.port)
    # Mark node for failover
    self.rest.fail_over(chosen[0].id, graceful=fail_over)
    rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                             self.servs_in, self.servs_out)
    expected_progress = 50
    rest = RestConnection(self.master)
    reached = RestHelper(rest).rebalance_reached(expected_progress)
    self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(expected_progress))
    if not RestHelper(rest).is_cluster_rebalanced():
        self.log.info("Stop the rebalance")
        stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3)
        self.assertTrue(stopped, msg="Unable to stop rebalance")
        self._verify_all_buckets(self.master, timeout=None, max_verify=self.max_verify,
                                 batch_size=1)
    self.shuffle_nodes_between_zones_and_rebalance()
    self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True,
                              check_bucket_stats=False)
    self.sleep(30)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          result_nodes, self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
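# A minimal sketch, assuming the RestHelper/RestConnection APIs used elsewhere in
# this file, of the "stop rebalance at N%, then resume" flow the docstring above
# describes. `rest` is an already-connected RestConnection to the master.
def _stop_and_resume_rebalance_sketch(rest, expected_progress=50):
    helper = RestHelper(rest)
    # rebalance_reached() polls the reported progress until it passes the threshold
    if helper.rebalance_reached(expected_progress) and not helper.is_cluster_rebalanced():
        rest.stop_rebalance(wait_timeout=30)
        # vBuckets that already moved stay where they are, so the restarted
        # rebalance only has to finish the remaining moves.
        rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()], ejectedNodes=[])
    return rest.monitorRebalance(stop_if_loop=True)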
def rebalance_out_with_failover(self):
    fail_over = self.input.param("fail_over", False)
    self.rest = RestConnection(self.master)
    gen_delete = BlobGenerator("mike", "mike-", self.value_size,
                               start=self.num_items / 2, end=self.num_items)
    gen_create = BlobGenerator("mike", "mike-", self.value_size,
                               start=self.num_items + 1, end=self.num_items * 3 / 2)
    # Define which doc ops will be performed during rebalancing
    # (multiple are allowed, but they run one by one)
    tasks = []
    if self.doc_ops is not None:
        if "update" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "create" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
        if "delete" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
    for task in tasks:
        task.result()
    ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1])
    self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    record_data_set = self.get_data_set_all(self.servers[:self.nodes_init], self.buckets)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    new_server_list = self.add_remove_servers(self.servers, self.servers[:self.nodes_init],
                                              [self.servers[self.nodes_init - 1], chosen[0]], [])
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)
    self.nodes = self.rest.node_statuses()
    self.sleep(20)
    self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[chosen[0].id, ejectedNode.id])
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
    self._wait_for_stats_all_buckets(new_server_list)
    self.verify_cluster_stats(new_server_list)
    self.data_analysis_all(record_data_set, new_server_list, self.buckets)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
def rebalance_out_with_failover_full_addback_recovery(self):
    gen_delete = BlobGenerator("mike", "mike-", self.value_size,
                               start=self.num_items / 2, end=self.num_items)
    gen_create = BlobGenerator("mike", "mike-", self.value_size,
                               start=self.num_items + 1, end=self.num_items * 3 / 2)
    # Define which doc ops will be performed during rebalancing
    # (multiple are allowed, but they run one by one)
    tasks = []
    if self.doc_ops is not None:
        if "update" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "create" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
        if "delete" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
    for task in tasks:
        task.result()
    servs_out = [self.servers[self.num_servers - i - 1] for i in range(self.nodes_out)]
    self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.num_servers], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.num_servers], self.buckets)
    record_data_set = self.get_data_set_all(self.servers[:self.num_servers], self.buckets)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
    # Mark node for full recovery
    if success_failed_over:
        self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full")
    rebalance = self.cluster.async_rebalance(self.servers[:1], [], servs_out)
    rebalance.result()
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers - self.nodes_out])
    self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out])
    self.compare_failovers_logs(prev_failover_stats,
                                self.servers[:self.num_servers - self.nodes_out], self.buckets)
    self.data_analysis_all(record_data_set,
                           self.servers[:self.num_servers - self.nodes_out], self.buckets)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
def rebalance_in_with_failover(self):
    fail_over = self.input.param("fail_over", False)
    gen_update = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    tasks = []
    tasks += self._async_load_all_buckets(self.master, gen_update, "update", 0)
    for task in tasks:
        task.result()
    servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.rest = RestConnection(self.master)
    self.nodes = self.get_nodes(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    self.rest.add_node(self.master.rest_username, self.master.rest_password,
                       self.servers[self.nodes_init].ip, self.servers[self.nodes_init].port)
    # Mark node for failover
    self.rest.fail_over(chosen[0].id, graceful=fail_over)
    if fail_over:
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Graceful failover failed")
    self.nodes = self.rest.node_statuses()
    self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[chosen[0].id])
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
    # Verification
    new_server_list = self.add_remove_servers(self.servers, self.servers[:self.nodes_init],
                                              [chosen[0]], [self.servers[self.nodes_init]])
    self._wait_for_stats_all_buckets(new_server_list)
    self._verify_stats_all_buckets(new_server_list, timeout=120)
    self.verify_cluster_stats(new_server_list)
    self.compare_failovers_logs(prev_failover_stats, new_server_list, self.buckets)
    self.sleep(30)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          new_server_list, self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    # Fail the rebalance at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        command = "[erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)]."
        memcached_restarted = rest.diag_eval(command)
        self.assertTrue(memcached_restarted,
                        "unable to restart memcached/moxi process through diag/eval")
        time.sleep(20)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
    self.log.info("DONE DATA ACCESS PHASE")
    # for bucket in rest.get_buckets():
    #     SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                   bucket_data[bucket.name].get('inserted_keys'),
    #                                   bucket.name, self)
    #     RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
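# The diag_eval payload above is an Erlang list comprehension: for each child of
# the ns_port_sup supervisor (memcached, moxi, ...) it takes the pid (element 2
# of the child tuple) and kills it; ns_server restarts the children, which is
# what makes the in-flight rebalance fail. A hedged sketch of issuing the same
# payload directly against the /diag/eval REST endpoint on port 8091; the
# `requests` usage is illustrative and not part of this suite:
import requests

def _kill_port_children_sketch(host, user, password):
    cmd = ("[erlang:exit(element(2, X), kill) || "
           "X <- supervisor:which_children(ns_port_sup)].")
    return requests.post("http://{0}:8091/diag/eval".format(host),
                         data=cmd, auth=(user, password))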
def common_test_body(self, keys_count, failover_reason):
    log = logger.Logger.get_logger()
    log.info("keys_count : {0}".format(keys_count))
    log.info("replicas : {0}".format(self.num_replicas))
    log.info("failover_reason : {0}".format(failover_reason))
    log.info('picking server : {0} as the master'.format(self.master))
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=10000, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers)
    _servers_ = self.servers
    rest = RestConnection(self.master)
    nodes = rest.node_statuses()
    RebalanceHelper.wait_for_replication(self.servers, self.cluster)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
    for node in chosen:
        # Trigger the failure condition on the node
        if failover_reason == 'stop_server':
            self.stop_server(node)
            log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait up to 5 minutes until the node is reported down
            self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            server = [srv for srv in self.servers if node.ip == srv.ip][0]
            RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
            status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
            if status:
                log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something is wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        info = shell.extract_remote_info()
                        if info.type.lower() == "windows":
                            o, r = shell.execute_command("netsh advfirewall show allprofiles")
                        else:
                            o, r = shell.execute_command("/sbin/iptables --list")
                        shell.log_command_output(o, r)
                        shell.disconnect()
                for i in rest.get_logs():
                    self.log.error(i)
                api = rest.baseUrl + 'nodeStatuses'
                status, content, header = rest._http_request(api)
                json_parsed = json.loads(content)
                self.log.info("nodeStatuses: {0}".format(json_parsed))
                self.fail("node status is not unhealthy even after waiting for 5 minutes")
        failed_over = rest.fail_over(node.id)
        if not failed_over:
            self.log.info("unable to failover the node the first time. "
                          "try again in 75 seconds..")
            # try again in 75 seconds
            time.sleep(75)
            failed_over = rest.fail_over(node.id)
        self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
        log.info("failed over node : {0}".format(node.id))
        self._failed_nodes.append(node)
    if self.add_back_flag:
        for node in self._failed_nodes:
            rest.add_back_node(node.id)
            time.sleep(5)
        log.info("10 seconds sleep after failover before invoking rebalance...")
        time.sleep(10)
        rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
    else:
        # Need a delay > min because of MB-7168
        log.info("60 seconds sleep after failover before invoking rebalance...")
        time.sleep(60)
        rest.rebalance(otpNodes=[node.id for node in nodes],
                       ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                rest = RestConnection(self.master)
        try:
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)
            log.info("Begin VERIFICATION ...")
            RebalanceHelper.wait_for_replication(_servers_, self.cluster)
            self.verify_cluster_stats(_servers_, self.master)
        finally:
            if self.during_ops:
                if self.during_ops == "change_password":
                    self.change_password(new_password=old_pass)
                elif self.during_ops == "change_port":
                    self.change_port(new_port='8091',
                                     current_port=self.input.param("new_port", "9090"))
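# For the "firewall" failover reason above, enable_firewall() isolates the victim
# node at the packet level so ns_server eventually marks it unhealthy. A minimal
# sketch of the idea; the exact rules testrunner installs may differ, and the
# port list here (management 8091/8092, data 11209-11211) is an assumption:
def _isolate_node_sketch(shell):
    # shell is a RemoteMachineShellConnection to the node being "failed"
    shell.execute_command(
        "/sbin/iptables -A INPUT -p tcp -m multiport "
        "--dports 8091,8092,11209,11210,11211 -j REJECT")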
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".format(
                expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers,
                                                              len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if server.ip not in [node.ip for node in toBeEjectedNodes]:
            not_failed_over.append(server)
            self.log.info("Node %s not failed over" % server.ip)
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    # Add back the same failed-over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with a node that is part of the cluster
    rest = RestConnection(master)
    # Given the optNode, find the ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in [node.ip for node in nodes]:
        if isinstance(server, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if server.ip not in add_back_servers:
            final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(add_back_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".format(
                expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
def test_failover_continuous_bidirectional_sets_deletes(self):
    cluster_ref_a = "cluster_ref_a"
    master_a = self._input.clusters.get(0)[0]
    rest_conn_a = RestConnection(master_a)
    cluster_ref_b = "cluster_ref_b"
    master_b = self._input.clusters.get(1)[0]
    rest_conn_b = RestConnection(master_b)
    # Rebalance all the nodes together
    servers_a = self._input.clusters.get(0)
    servers_b = self._input.clusters.get(1)
    rebalanced_servers_a = []
    rebalanced_servers_b = []
    RebalanceHelper.rebalance_in(servers_a, len(servers_a) - 1)
    RebalanceHelper.rebalance_in(servers_b, len(servers_b) - 1)
    rebalanced_servers_a.extend(servers_a)
    rebalanced_servers_b.extend(servers_b)
    # Set up bi-directional continuous replication
    replication_type = "continuous"
    rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                   master_b.rest_username, master_b.rest_password,
                                   cluster_ref_b)
    rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
                                   master_a.rest_username, master_a.rest_password,
                                   cluster_ref_a)
    (rep_database_a, rep_id_a) = rest_conn_a.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_b)
    (rep_database_b, rep_id_b) = rest_conn_b.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_a)
    load_thread_list = []
    # Start load
    kvstore = ClientKeyValueStore()
    self._params["ops"] = "set"
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread.start()
    load_thread.join()
    RebalanceHelper.wait_for_persistence(master_a, self._buckets[0])
    # Do some deletes
    self._params["ops"] = "delete"
    self._params["count"] = self._num_items / 5
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread_list.append(load_thread)
    # Start all loads concurrently
    for lt in load_thread_list:
        lt.start()
    # Do the failover of nodes on both clusters
    self.log.info("Failing over nodes")
    self.log.info("current nodes on cluster 1: {0}".format(
        RebalanceHelper.getOtpNodeIds(master_a)))
    self.log.info("current nodes on cluster 2: {0}".format(
        RebalanceHelper.getOtpNodeIds(master_b)))
    # Find nodes to be failed over
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_a, howmany=self._failover_factor)
    optNodesIds_a = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_a:
        status, content = ClusterOperationHelper.find_orchestrator(master_a)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds_a[0] = content
        master_a = self._input.clusters.get(0)[-1]
        rest_conn_a = RestConnection(master_a)
    # Failover selected nodes on cluster 1
    for node in optNodesIds_a:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_a.fail_over(node)
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_b, howmany=self._failover_factor)
    optNodesIds_b = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_b:
        status, content = ClusterOperationHelper.find_orchestrator(master_b)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds_b[0] = content
        master_b = self._input.clusters.get(1)[-1]
        rest_conn_b = RestConnection(master_b)
    self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
    self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))
    # Failover selected nodes on cluster 2
    for node in optNodesIds_b:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_b.fail_over(node)
    rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],
                          ejectedNodes=optNodesIds_a)
    rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],
                          ejectedNodes=optNodesIds_b)
    self.assertTrue(rest_conn_a.monitorRebalance(),
                    msg="rebalance operation failed after adding node on cluster 1")
    self.assertTrue(rest_conn_b.monitorRebalance(),
                    msg="rebalance operation failed after adding node on cluster 2")
    # Wait for loading threads to finish
    for lt in load_thread_list:
        lt.join()
    self.log.info("All loading threads finished")
    # Verify replication
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b, self._buckets[0],
                                                        kvstore, self._poll_sleep,
                                                        self._poll_timeout),
                    "Verification of replicated data failed")
    self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a, rest_conn_b,
                                                        self._buckets[0], self._poll_sleep,
                                                        self._poll_timeout),
                    "Verification of replicated revisions failed")
def common_test_body(self, keys_count, failover_reason):
    log = logger.Logger.get_logger()
    log.info("keys_count : {0}".format(keys_count))
    log.info("replicas : {0}".format(self.num_replicas))
    log.info("failover_reason : {0}".format(failover_reason))
    log.info('picking server : {0} as the master'.format(self.master))
    self._load_all_buckets(self.master, self.gen_create, "create", 0,
                           batch_size=10000, pause_secs=5, timeout_secs=180)
    self._wait_for_stats_all_buckets(self.servers)
    _servers_ = self.servers
    rest = RestConnection(self.master)
    nodes = rest.node_statuses()
    RebalanceHelper.wait_for_replication(self.servers, self.cluster)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
    for node in chosen:
        # Trigger the failure condition on the node
        if failover_reason == 'stop_server':
            self.stop_server(node)
            log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait up to 5 minutes until the node is reported down
            self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            RemoteUtilHelper.enable_firewall(self.servers, node,
                                             bidirectional=self.bidirectional)
            status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
            if status:
                log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something is wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        o, r = shell.execute_command("/sbin/iptables --list")
                        shell.log_command_output(o, r)
                        shell.disconnect()
                for i in rest.get_logs():
                    self.log.error(i)
                self.fail("node status is not unhealthy even after waiting for 5 minutes")
        failed_over = rest.fail_over(node.id)
        if not failed_over:
            self.log.info("unable to failover the node the first time. "
                          "try again in 75 seconds..")
            # try again in 75 seconds
            time.sleep(75)
            failed_over = rest.fail_over(node.id)
        self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
        log.info("failed over node : {0}".format(node.id))
        self._failed_nodes.append(node)
    if self.add_back_flag:
        for node in self._failed_nodes:
            rest.add_back_node(node.id)
            time.sleep(5)
        log.info("10 seconds sleep after failover before invoking rebalance...")
        time.sleep(10)
        rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
    else:
        # Need a delay > min because of MB-7168
        log.info("30 seconds sleep after failover before invoking rebalance...")
        time.sleep(30)
        rest.rebalance(otpNodes=[node.id for node in nodes],
                       ejectedNodes=[node.id for node in chosen])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        for failed in chosen:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
    log.info("Begin VERIFICATION ...")
    RebalanceHelper.wait_for_replication(_servers_, self.cluster)
    self.verify_cluster_stats(_servers_, self.master)
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers,
                                                              len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [node.port for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
        else:
            if server.ip not in [node.ip for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    # Add back the same failed-over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with a node that is part of the cluster
    rest = RestConnection(master)
    # Given the optNode, find the ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in nodes:
        if isinstance(server.ip, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [serv.port for serv in add_back_servers]:
                final_add_back_servers.append(server)
        else:
            if server.ip not in [serv.ip for serv in add_back_servers]:
                final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(add_back_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get the PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command(
            "ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        pid = o[0]
        shell.disconnect()
    else:
        times = 2
        if self.cluster_run:
            times = 20
        for i in xrange(times):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 2)
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for the new node when the rebalance fails
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
    i = 0
    # we expect the rebalance to fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}".format(
                            toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def rebalance_out_with_failover(self):
    fail_over = self.input.param("fail_over", False)
    self.rest = RestConnection(self.master)
    gen_delete = BlobGenerator('mike', 'mike-', self.value_size,
                               start=self.num_items / 2, end=self.num_items)
    gen_create = BlobGenerator('mike', 'mike-', self.value_size,
                               start=self.num_items + 1, end=self.num_items * 3 / 2)
    # Define which doc ops will be performed during rebalancing
    # (multiple are allowed, but they run one by one)
    tasks = []
    if self.doc_ops is not None:
        if "update" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "create" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
        if "delete" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
    for task in tasks:
        task.result()
    ejectedNode = self.find_node_info(self.master, self.servers[self.nodes_init - 1])
    self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    record_data_set = self.get_data_set_all(self.servers[:self.nodes_init], self.buckets)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    new_server_list = self.add_remove_servers(self.servers, self.servers[:self.nodes_init],
                                              [self.servers[self.nodes_init - 1], chosen[0]], [])
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=fail_over)
    self.nodes = self.rest.node_statuses()
    self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                        ejectedNodes=[chosen[0].id, ejectedNode.id])
    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg="Rebalance failed")
    self.verify_cluster_stats(new_server_list, check_ep_items_remaining=True)
    self.sleep(30)
    self.data_analysis_all(record_data_set, new_server_list, self.buckets)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
def common_test_body(self, failover_reason):
    """Main test body which contains the flow of the failover basic steps:
    1. Start operations if programmed into the test case (before/after)
    2. Start view and index building operations
    3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
    4.1 Rebalance the cluster after failover of K nodes
    4.2 Run add-back operation with recoveryType=(full/delta) with rebalance
    5. Verify all expected operations completed by checking stats,
       replication, views, data correctness
    """
    # Pick the reference node for communication;
    # we pick a node in the cluster which will NOT be failed over
    self.filter_list = []
    if self.failoverMaster:
        self.master = self.servers[1]
    self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip))
    self.print_test_params(failover_reason)
    self.rest = RestConnection(self.master)
    self.nodes = self.rest.node_statuses()
    # Set the data path for the cluster
    self.data_path = self.rest.get_data_path()
    # Check if the test case has to be run for 3.0.0
    versions = self.rest.get_nodes_versions()
    self.version_greater_than_2_5 = True
    for version in versions:
        if "3" > version:
            self.version_greater_than_2_5 = False
    # Do not run this test if the graceful category is being used
    if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType is not None)):
        self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
        self.log.error("Please check configuration parameters: SKIPPING TEST.")
        return
    # Find nodes that will undergo failover
    if self.failoverMaster:
        self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1,
                                                 target_node=self.servers[0])
    else:
        self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes)
    # Perform operations - Create/Update/Delete
    # self.withMutationOps = True  => run operations in parallel to failover
    # self.withMutationOps = False => run operations before failover
    self.load_initial_data()
    if not self.withMutationOps:
        self.run_mutation_operations()
    # Perform view creation tasks and check for completion if required before failover
    if self.withViewsOps:
        self.run_view_creation_operations(self.servers)
        if not self.createIndexesDuringFailover:
            self.query_and_monitor_view_tasks(self.servers)
    # Take a snapshot of the data set used for validation
    record_static_data_set = {}
    prev_vbucket_stats = {}
    prev_failover_stats = {}
    if not self.withMutationOps:
        record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path=None)
    # Capture vbucket and failover stats if test version >= 2.5.*
    if self.version_greater_than_2_5 and self.upr_check:
        prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
        prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)
    # Perform operations related to failover
    if self.withMutationOps or self.withViewsOps or self.compact:
        self.run_failover_operations_with_ops(self.chosen, failover_reason)
    else:
        self.run_failover_operations(self.chosen, failover_reason)
    # Perform add-back operation with rebalance, or only rebalance with verifications
    if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
        if self.add_back_flag:
            self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats,
                                                   record_static_data_set, prev_failover_stats)
        else:
            self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats,
                                                         record_static_data_set, prev_failover_stats)
    else:
        return
    if self.during_ops is None:
        self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                              master_node=self.master)
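# --- Illustrative sketch (not part of the original suite) ---
# The lexicographic check `"3" > version` above only works while major
# versions are single digits (e.g. "10.0.0" would compare smaller than "3").
# A numeric alternative, assuming versions look like "X.Y.Z-build":
def _is_pre_3_0(version):
    return int(version.split('.')[0]) < 3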
def common_test_body(self, keys_count, replica, load_ratio, failover_reason):
    log = logger.Logger.get_logger()
    log.info("keys_count : {0}".format(keys_count))
    log.info("replica : {0}".format(replica))
    log.info("load_ratio : {0}".format(load_ratio))
    log.info("failover_reason : {0}".format(failover_reason))
    master = self._servers[0]
    log.info('picking server : {0} as the master'.format(master))
    rest = RestConnection(master)
    info = rest.get_nodes_self()
    rest.init_cluster(username=master.rest_username, password=master.rest_password)
    rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
    bucket_ram = info.memoryQuota * 2 / 3
    bucket = 'default'
    rest.create_bucket(bucket=bucket, ramQuotaMB=bucket_ram,
                       replicaNumber=replica, proxyPort=info.moxi)
    ready = BucketOperationHelper.wait_for_memcached(master, bucket)
    self.assertTrue(ready, "wait_for_memcached failed")
    credentials = self._input.membase_settings
    ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers, credentials, self)
    nodes = rest.node_statuses()
    rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
    msg = "rebalance failed after adding these nodes {0}".format(nodes)
    self.assertTrue(rest.monitorRebalance(), msg=msg)
    inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count, load_ratio)
    inserted_count = len(inserted_keys)
    log.info('inserted {0} keys'.format(inserted_count))
    nodes = rest.node_statuses()
    while (len(nodes) - replica) > 1:
        final_replication_state = RestHelper(rest).wait_for_replication(900)
        msg = "replication state after waiting for up to 15 minutes : {0}"
        self.log.info(msg.format(final_replication_state))
        chosen = RebalanceHelper.pick_nodes(master, howmany=replica)
        for node in chosen:
            # Bring the node down first, depending on the failover reason
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("waiting for membase-server to shut down")
                # wait until the node is reported down (up to 5 minutes)
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                RemoteUtilHelper.enable_firewall(self._servers, node,
                                                 bidirectional=self.bidirectional)
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                msg="node status is not unhealthy even after waiting for 5 minutes")
            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. "
                              "try again in 75 seconds..")
                # retry once after waiting 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over,
                            "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
        # short pause after failover before invoking rebalance
        time.sleep(10)
        rest.rebalance(otpNodes=[node.id for node in nodes],
                       ejectedNodes=[node.id for node in chosen])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(rest.monitorRebalance(), msg=msg)
        FailoverBaseTest.replication_verification(master, bucket, replica, inserted_count, self)
        nodes = rest.node_statuses()
    FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
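# --- Illustrative sketch (not part of the original suite) ---
# The failover-with-one-retry pattern above, expressed as a standalone
# helper; `_fail_over_with_retry` is a hypothetical name, and only the
# rest.fail_over() call already used in this file is assumed.
def _fail_over_with_retry(rest, otp_node_id, retry_delay=75):
    failed_over = rest.fail_over(otp_node_id)
    if not failed_over:
        time.sleep(retry_delay)  # give ns_server time to settle, then retry once
        failed_over = rest.fail_over(otp_node_id)
    return failed_over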
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
        # handle the situation when rebalance failed at the very beginning
        self.log.error('seems rebalance failed!')
        self.log.info("Latest logs from UI:")
        for i in rest.get_logs():
            self.log.error(i)
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command(
            "ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        pid = o[0]
        shell.disconnect()
    else:
        for i in xrange(2):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}. Retry in 1 sec".format(e))
                SwapRebalanceBase.sleep(self, 1)
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for the new node when rebalance fails
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
    # we expect that the rebalance will fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
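# --- Illustrative sketch (not part of the original suite) ---
# The two PID-discovery paths above (remote shell vs. memcached stats),
# condensed into one hypothetical helper. It reuses
# RemoteMachineShellConnection and MemcachedClientHelper exactly as they
# appear in the test; the helper name itself is invented.
def _get_memcached_pid(master, bucket, via_shell=False):
    if via_shell:
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command(
            "ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        shell.disconnect()
        return o[0] if o else None
    _mc = MemcachedClientHelper.direct_client(master, bucket)
    return _mc.stats()["pid"]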
def test_rebalance_in_out_with_failover(self):
    """
    Rebalances nodes out and in with failover.
    Use different nodes_in and nodes_out params to have uneven add and deletion.
    Use the 'zone' param to have nodes divided into server groups by having zone > 1.

    This test begins by loading a given number of items into the cluster. It then
    removes one node, rebalances that node out of the cluster, and then rebalances
    it back in. During the rebalancing we update all of the items in the cluster.
    Once the node has been removed and added back we wait for the disk queues to
    drain, and then verify that there has been no data loss: sum(curr_items) must
    match curr_items_total. We then remove and add back two nodes at a time and so
    on until we have reached the point where we are adding back and removing at
    least half of the nodes.
    """
    fail_over = self.input.param("fail_over", False)
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
    servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
    servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init]
    for task in tasks:
        task.result(self.wait_timeout * 20)
    if self.flusher_total_batch_limit is not None:
        # Validate seq_no snap_start/stop values after initial doc_load
        self.check_snap_start_corruption()
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    self.sleep(20)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    if self.flusher_total_batch_limit is None:
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    result_nodes = list(set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
    for node in servs_in:
        self.rest.add_node(self.master.rest_username, self.master.rest_password,
                           node.ip, node.port)
    if self.flusher_total_batch_limit is not None:
        # Load data after add-node
        self._load_all_buckets(self.master, gen, "update", 0)
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()
    # Mark node for failover
    self.rest.fail_over(chosen[0].id, graceful=fail_over)
    self.wait_for_failover_or_assert(expected_failover_count=1)
    # Load data after failover
    self._load_all_buckets(self.master, gen, "update", 0)
    # Validate seq_no snap_start/stop values
    self.check_snap_start_corruption()
    # No need to pass self.sleep_before_rebalance,
    # since the previous ops are synchronous calls
    self.shuffle_nodes_between_zones_and_rebalance(servs_out)
    if self.flusher_total_batch_limit is not None:
        # Validate seq_no snap_start/stop values after rebalance
        self.check_snap_start_corruption()
    self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True)
    if self.flusher_total_batch_limit is None:
        self.compare_failovers_logs(prev_failover_stats, result_nodes, self.buckets)
    self.sleep(30)
    self.data_analysis_active_replica_all(disk_active_dataset, disk_replica_dataset,
                                          result_nodes, self.buckets, path=None)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, std=1.0, total_vbuckets=self.total_vbuckets)
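# --- Illustrative worked example (not part of the original suite) ---
# Slicing arithmetic used above, assuming nodes_init=4, nodes_in=2,
# nodes_out=1 and at least six entries in self.servers:
#   servs_in     = self.servers[4:6]   -> two new nodes join
#   servs_out    = self.servers[3:4]   -> the last initial node leaves
#   result_nodes = set(servers[0:4] + servers[4:6]) - set(servers[3:4])
#                = servers 0, 1, 2, 4, 5  (five nodes after rebalance)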
def rebalance_out_with_failover_full_addback_recovery(self):
    gen_delete = BlobGenerator('mike', 'mike-', self.value_size,
                               start=self.num_items / 2, end=self.num_items)
    gen_create = BlobGenerator('mike', 'mike-', self.value_size,
                               start=self.num_items + 1, end=self.num_items * 3 / 2)
    # Define which document ops will be performed during rebalance;
    # multiple ops are allowed, executed one by one
    tasks = []
    if self.doc_ops is not None:
        if "update" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "create" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_create, "create", 0)
        if "delete" in self.doc_ops:
            tasks += self._async_load_all_buckets(self.master, gen_delete, "delete", 0)
    for task in tasks:
        task.result()
    servs_out = [self.servers[self.num_servers - i - 1] for i in range(self.nodes_out)]
    self._verify_stats_all_buckets(self.servers[:self.num_servers], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
    self.rest = RestConnection(self.master)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    self.sleep(20)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.num_servers], self.buckets)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.num_servers], self.buckets)
    record_data_set = self.get_data_set_all(self.servers[:self.num_servers], self.buckets)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
    # Mark node for full recovery
    if success_failed_over:
        self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType="full")
    rebalance = self.cluster.async_rebalance(self.servers[:1], [], servs_out)
    rebalance.result()
    self.verify_cluster_stats(self.servers[:self.num_servers - self.nodes_out],
                              check_ep_items_remaining=True)
    self.compare_failovers_logs(prev_failover_stats,
                                self.servers[:self.num_servers - self.nodes_out], self.buckets)
    self.sleep(30)
    self.data_analysis_all(record_data_set,
                           self.servers[:self.num_servers - self.nodes_out], self.buckets)
    self.verify_unacked_bytes_all_buckets()
    nodes = self.get_nodes_in_cluster(self.master)
    self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=1.0,
                                  total_vbuckets=self.total_vbuckets)
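# --- Illustrative summary (not part of the original suite) ---
# Presumed roles of the verification helpers chained above, inferred from
# their names and how they are used in this file:
#   compare_failovers_logs            -> failover log continuity per vbucket
#   data_analysis_all                 -> disk data matches the recorded snapshot
#   verify_unacked_bytes_all_buckets  -> replication queues fully drained
#   vb_distribution_analysis          -> vbuckets spread evenly (std=1.0 bound)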
def test_rebalance_in_out_with_failover_addback_recovery(self):
    """
    Rebalances nodes out and in with failover and full/delta recovery add back of a node.
    Use different nodes_in and nodes_out params to have uneven add and deletion.
    Use the 'zone' param to have nodes divided into server groups by having zone > 1.

    This test begins by loading a given number of items into the cluster. It then
    removes one node, rebalances that node out of the cluster, and then rebalances
    it back in. During the rebalancing we update all of the items in the cluster.
    Once the node has been removed and added back we wait for the disk queues to
    drain, and then verify that there has been no data loss: sum(curr_items) must
    match curr_items_total. We then remove and add back two nodes at a time and so
    on until we have reached the point where we are adding back and removing at
    least half of the nodes.
    """
    recovery_type = self.input.param("recoveryType", "full")
    gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
    self._load_all_buckets(self.master, gen, "create", 0)
    tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
    servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
    servs_out = self.servers[self.nodes_init - self.nodes_out:self.nodes_init]
    for task in tasks:
        task.result(self.wait_timeout * 20)
    self._verify_stats_all_buckets(self.servers[:self.nodes_init], timeout=120)
    self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
    # self.sleep(20)
    prev_vbucket_stats = self.get_vbucket_seqnos(self.servers[:self.nodes_init], self.buckets)
    prev_failover_stats = self.get_failovers_logs(self.servers[:self.nodes_init], self.buckets)
    disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
        self.servers[:self.nodes_init], self.buckets, path=None)
    self.compare_vbucketseq_failoverlogs(prev_vbucket_stats, prev_failover_stats)
    self.rest = RestConnection(self.master)
    self.nodes = self.get_nodes(self.master)
    result_nodes = list(set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
    for node in servs_in:
        self.rest.add_node(self.master.rest_username, self.master.rest_password,
                           node.ip, node.port)
    chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
    # Mark node for failover
    success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
    self.wait_for_failover_or_assert(expected_failover_count=1)
    # Mark node for full/delta recovery
    if success_failed_over:
        self.rest.set_recovery_type(otpNode=chosen[0].id, recoveryType=recovery_type)
    # self.sleep(30)
    try:
        self.shuffle_nodes_between_zones_and_rebalance(servs_out)
    except Exception as e:
        if "deltaRecoveryNotPossible" not in str(e):
            self.fail("Rebalance failed with an unexpected error; it has to fail with"
                      " deltaRecoveryNotPossible since delta recovery is not possible"
                      " while also adding nodes")
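# --- Illustrative sketch (not part of the original suite) ---
# Condensed add-back flow exercised above, valid for both recovery types:
#   1. rest.fail_over(node_id, graceful=False)          # hard failover
#   2. rest.set_recovery_type(otpNode=node_id,
#                             recoveryType="full")      # or "delta"
#   3. rebalance                                        # node rejoins
# With recoveryType="delta" the node keeps its on-disk data and only catches
# up from the failover point; with "full" it is wiped and rebuilt from
# replicas. Delta recovery cannot be combined with adding new nodes in the
# same rebalance, which is what the try/except above checks for.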