def _common_test_body(self, moxi=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    bucket_data = RebalanceBaseTest.bucket_data_init(rest)

    for server in self.servers[1:]:
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        self.log.info("adding node {0}:{1} and rebalance afterwards".format(server.ip, server.port))
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster {1}"
        self.assertTrue(otpNode, msg.format(server.ip, master.ip))
        for name in bucket_data:
            inserted_keys, rejected_keys = \
                MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.servers[0]],
                                                                      name=name,
                                                                      ram_load_ratio=-1,
                                                                      number_of_items=self.keys_count,
                                                                      number_of_threads=1,
                                                                      write_only=True)
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=[])
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(server.ip))
            self.log.info("completed rebalancing in server {0}".format(server))
            IncrementalRebalanceWithParallelReadTests._reader_thread(self, inserted_keys,
                                                                     bucket_data, moxi=moxi)
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(server.ip))
            break
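# The add-node / rebalance / monitorRebalance sequence above repeats in
# almost every test in this suite. A minimal sketch of a shared helper that
# could factor it out -- a hypothetical refactor, not part of the original
# suite; it uses only RestConnection calls already exercised above.
def _rebalance_and_wait(test, rest, ejected_ids=None):
    # Rebalance with the current node set, ejecting the given otp node ids,
    # then block until ns_server reports the rebalance as complete.
    ejected_ids = ejected_ids or []
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=ejected_ids)
    test.assertTrue(rest.monitorRebalance(),
                    msg="rebalance failed (ejected: {0})".format(ejected_ids))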
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
        self.assertTrue(rest.monitorRebalance(),
                        msg="failed after failover of {0}".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    rebalanced_servers = [master]
    bucket_data = RebalanceBaseTest.bucket_data_init(rest)
    self.log.info("INITIAL LOAD")
    RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data,
                                            self.load_ratio, keys_count=self.keys_count)
    for name in bucket_data:
        for thread in bucket_data[name]["threads"]:
            bucket_data[name]["items_inserted_count"] += thread.inserted_keys_count()

    for server in self.servers[1:]:
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        # do this 2 times: start rebalance, failover the node, remove the node and rebalance
        for i in range(0, self.num_rebalance):
            distribution = RebalanceBaseTest.get_distribution(self.load_ratio)
            RebalanceBaseTest.load_data_for_buckets(rest, self.load_ratio, distribution,
                                                    [master], bucket_data, self)
            self.log.info("adding node {0} and rebalance afterwards".format(server.ip))
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster {1}"
            self.assertTrue(otpNode, msg.format(server.ip, master.ip))
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=[])
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(server.ip))
            rebalanced_servers.append(server)
            RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
            rest.fail_over(otpNode.id)
            self.log.info("failed over {0}".format(otpNode.id))
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=[otpNode.id])
            msg = "rebalance failed while removing failover nodes {0}".format(otpNode.id)
            self.assertTrue(rest.monitorRebalance(), msg=msg)
            # now verify the numbers again?
            RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
            # wait 6 minutes
            time.sleep(6 * 60)

        self.log.info("adding node {0} and rebalance afterwards".format(server.ip))
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster {1}"
        self.assertTrue(otpNode, msg.format(server.ip, master.ip))
        distribution = RebalanceBaseTest.get_distribution(self.load_ratio)
        RebalanceBaseTest.load_data_for_buckets(rest, self.load_ratio, distribution,
                                                rebalanced_servers, bucket_data, self)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[])
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}".format(server.ip))
        rebalanced_servers.append(server)
        RebalanceBaseTest.replication_verification(master, bucket_data, self.replica, self, True)
def _common_test_body(self):
    master = self.servers[0]
    rest = RestConnection(master)
    bucket_data = RebalanceBaseTest.bucket_data_init(rest)
    self.log.info("INITIAL LOAD")
    RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data,
                                            self.load_ratio, keys_count=self.keys_count)
    rebalance_out = False
    for server in self.servers[1:]:
        if rebalance_out:
            # Pick a node to rebalance out, other than master
            ejectedNodes = [RebalanceHelper.pick_node(master)]
        else:
            ejectedNodes = []
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        self.log.info("adding node {0}, removing node {1} and rebalance afterwards".format(
            server.ip, [node.ip for node in ejectedNodes]))

        self.log.info("START PARALLEL LOAD")
        RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data,
                                            DELETE_RATIO=self.delete_ratio,
                                            ACCESS_RATIO=self.access_ratio,
                                            EXPIRY_RATIO=self.expiry_ratio)

        self.log.info("INCREMENTAL REBALANCE IN/OUT")
        # rebalance in/out a server
        RebalanceTaskHelper.add_rebalance_task(self.task_manager, [master], [server],
                                               ejectedNodes, do_stop=self.do_stop)
        # wait for loading tasks to finish
        RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data)

        # Make sure we have at least 3 nodes, for replica=2
        if len(current_nodes) > 2:
            rebalance_out = True

    if self.do_verify:
        self.log.info("VERIFICATION")
        RebalanceBaseTest.do_kv_and_replica_verification(master, self.task_manager,
                                                         bucket_data, self.replica, self)
    else:
        self.log.info("NO VERIFICATION")
def test_failover_source_sets(self):
    # This test starts with a 2-2 unidirectional replication from cluster a
    # to cluster b; during the replication, we trigger failover of one node
    # on the source cluster, resulting in a 1-2 replication.
    # After all loading finishes, verify data and revs on both clusters.
    replication_type = "continuous"
    self.log.info("Force initial rebalance.")
    cluster_ref_a = "cluster_ref_a"
    master_a = self._input.clusters.get(0)[0]
    rest_conn_a = RestConnection(master_a)
    cluster_ref_b = "cluster_ref_b"
    master_b = self._input.clusters.get(1)[0]
    rest_conn_b = RestConnection(master_b)

    self.log.info("START XDC replication...")
    # Start replication
    rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                   master_b.rest_username,
                                   master_b.rest_password, cluster_ref_b)
    (rep_database, rep_id) = rest_conn_a.start_replication(replication_type,
                                                           self._buckets[0],
                                                           cluster_ref_b)
    self._state.append((rest_conn_a, cluster_ref_b, rep_database, rep_id))

    # Start load
    self.log.info("START loading data...")
    load_thread_list = []
    kvstore = ClientKeyValueStore()
    self._params["ops"] = "set"
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread_list.append(load_thread)
    load_thread.start()
    # sleep a while to allow more data to be loaded
    time.sleep(5)
    self.log.info("current nodes on source cluster: {0}".format(
        RebalanceHelper.getOtpNodeIds(master_a)))

    # Trigger failover; fail over one node at a time until only one node remains
    self.log.info("DURING replication, start failover...")
    self.log.info("FAILOVER nodes on Cluster A ...")
    nodes_a = rest_conn_a.node_statuses()
    while len(nodes_a) > 1:
        toBeFailedOverNode = RebalanceHelper.pick_node(master_a)
        self.log.info("failover node {0}".format(toBeFailedOverNode.id))
        rest_conn_a.fail_over(toBeFailedOverNode.id)
        self.log.info("rebalance after failover")
        rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],
                              ejectedNodes=[toBeFailedOverNode.id])
        self.assertTrue(rest_conn_a.monitorRebalance(),
                        msg="rebalance operation failed after removing node {0}".format(
                            toBeFailedOverNode.id))
        nodes_a = rest_conn_a.node_statuses()
    self.log.info("ALL failed over done...")

    # Wait for loading threads to finish
    for lt in load_thread_list:
        lt.join()
    self.log.info("All loading threads finished")

    # Verify replication
    self.log.info("START data verification at cluster A...")
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_a, self._buckets[0], kvstore,
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated data failed")
    self.log.info("START data verification at cluster B...")
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b, self._buckets[0], kvstore,
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated data failed")
    self.log.info("START revision verification on both clusters...")
    self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a, rest_conn_b, self._buckets[0],
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated revisions failed")
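# The remote-cluster + start_replication setup recurs in every XDCR test
# here. A sketch of it as a helper -- hypothetical name, built only from the
# RestConnection calls shown above; the caller still appends the returned
# tuple to self._state for teardown.
def _setup_replication(src_rest, dest_master, cluster_ref, bucket,
                       replication_type="continuous"):
    # Register the destination cluster, then start a (continuous) replication
    # of `bucket` to it; returns (rep_database, rep_id) as start_replication does.
    src_rest.add_remote_cluster(dest_master.ip, dest_master.port,
                                dest_master.rest_username,
                                dest_master.rest_password, cluster_ref)
    return src_rest.start_replication(replication_type, bucket, cluster_ref)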
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        self.log.info("Latest logs from UI:")
        for i in rest.get_logs():
            self.log.error(i)
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        pid = o[0]
        shell.disconnect()
    else:
        for i in xrange(2):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}. Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 1)
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for the new node when rebalance fails
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
    # we expect that the rebalance will fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers,
                                                              len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if server.ip not in [node.ip for node in toBeEjectedNodes]:
            not_failed_over.append(server)
            self.log.info("Node %s not failed over" % server.ip)
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after removing nodes {0}".format(optNodesIds))
    # Add back the same failed over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with a node that is part of the cluster
    rest = RestConnection(master)
    # Given the optNode, find ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in [node.ip for node in nodes]:
        if isinstance(server, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if server.ip not in add_back_servers:
            final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes {0}".format(add_back_servers))
    SwapRebalanceBase.verification_phase(self, master)
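# The add-back list computation above walks two lists by hand. An equivalent
# sketch using a set difference on ips -- a hypothetical refactor over the
# same rest.get_nodes() data, shown for clarity only.
def _servers_not_in_cluster(all_servers, rest):
    # Return the configured servers whose ip is not reported by the cluster.
    in_cluster_ips = set(node.ip for node in rest.get_nodes())
    return [server for server in all_servers if server.ip not in in_cluster_ips]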
def test_rebalance_inout_with_durability_failure(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add nodes using 'self.nodes_in' such that,
       replica_number > nodes_in_cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance
    5. Add back a node and do CRUD on the bucket
    6. Verify durability works after node addition

    Note: This is a Negative case. i.e: Durability will be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # TODO: Enable verification
    """
    vbucket_info_dict = dict()

    # Cb stat object for verification purpose
    master_shell_conn = RemoteMachineShellConnection(master)
    master_node_cb_stat = Cbstats(master_shell_conn)

    # Update each vbucket's seq_no for latest value for verification
    for vb_num in range(0, self.vbuckets):
        vbucket_info_dict[vb_num] = master_node_cb_stat.vbucket_seqno(
            def_bucket.name, vb_num, "abs_high_seqno")
    """

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.nodes_out == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))

    # TODO: There will be failure in doc_count verification due to
    # swap_rebalance. Need to update verification steps accordingly to
    # satisfy this
    self.verification_phase()

    # Add the first ejected node back into the cluster
    self.task.rebalance(self.cluster.nodes_in_cluster, [toBeEjectedNodes[0]], [])

    # Load docs into all vbuckets to verify durability
    gen_create = doc_generator('test_', 0, self.num_items)
    task = self.task.async_load_gen_docs_atomicity(
        self.cluster, def_bucket, gen_create, self.op_type, exp=0,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
        transaction_timeout=self.transaction_timeout,
        commit=self.transaction_commit)
    self.task_manager.get_task_result(task)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
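# The stop/start-at-progress loop above is repeated almost verbatim across
# several tests in this file. A sketch of it as a standalone helper --
# hypothetical name, built only from _rebalance_progress(), stop_rebalance()
# and rebalance(), all used above. Returns True if a stop/restart happened.
def _stop_start_rebalance_at(test, rest, expected_progress, optNodesIds, max_retry=100):
    retry = 0
    while True:
        progress = rest._rebalance_progress()
        if progress < 0:
            test.log.error("rebalance progress code : {0}".format(progress))
            return False
        elif progress == 100:
            test.log.warn("Rebalance has already reached 100%")
            return False
        elif progress >= expected_progress:
            # Stop once the target progress is reached, then resume
            test.assertTrue(rest.stop_rebalance(), msg="unable to stop rebalance")
            time.sleep(20)
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=optNodesIds)
            return True
        elif retry > max_retry:
            return False
        retry += 1
        time.sleep(1)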
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, _ = RebalanceHelper.rebalance_in(self.servers, len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()
    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [node.port for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
        else:
            if server.ip not in [node.ip for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
    if self.fail_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = self.start_access_phase()
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after removing nodes {0}".format(optNodesIds))
    # Add back the same failed over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with a node that is part of the cluster
    rest = RestConnection(master)
    # Given the optNode, find ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in nodes:
        if isinstance(server.ip, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [serv.port for serv in add_back_servers]:
                final_add_back_servers.append(server)
        else:
            if server.ip not in [serv.ip for serv in add_back_servers]:
                final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes {0}".format(add_back_servers))
    self.verification_phase()
def test_rebalance_inout_with_durability_check(self):
    """
    Perform irregular number of in_out nodes
    1. Swap-out 'self.nodes_out' nodes
    2. Add 'self.nodes_in' nodes into the cluster
    3. Perform swap-rebalance
    4. Make sure durability is not broken due to swap-rebalance

    Note: This is a Positive case. i.e: Durability should not be broken
    """
    master = self.cluster.master
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    def_bucket = self.bucket_util.buckets[0]

    # Update replica value before performing rebalance in/out
    if self.replica_to_update:
        bucket_helper = BucketHelper(self.cluster.master)
        # Recalculate replicate_to/persist_to as per new replica value
        if self.durability_level is None:
            self.replicate_to = floor(self.replica_to_update / 2) + 1
            self.persist_to = floor(self.replica_to_update / 2) + 2
        # Update bucket replica to new value as given in conf file
        self.log.info("Updating replica count of bucket to {0}".format(self.replica_to_update))
        bucket_helper.change_bucket_props(def_bucket.name,
                                          replicaNumber=self.replica_to_update)

    # Rest connection to add/rebalance/monitor nodes
    rest = RestConnection(master)

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.nodes_out)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.nodes_out == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.nodes_in]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if self.do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    self.verification_phase()
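# Worked example of the replicate_to/persist_to recalculation above, using
# the same floor-based formula (from math import floor); the concrete
# replica count is an assumed illustration:
#   replica_to_update = 2  ->  replicate_to = floor(2 / 2) + 1 = 2
#                              persist_to   = floor(2 / 2) + 2 = 3
# i.e. each mutation must replicate to a majority of replicas and persist on
# one more node than it replicates to before the SDK considers it durable.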
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.cluster.master
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[1:num_initial_servers]

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status = self.task.rebalance(self.cluster.servers[:self.nodes_init],
                                 initial_servers, [])
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()

    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()
    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    self.sleep(20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    self.sleep(1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    self.verification_phase()
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    self.create_buckets()
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, _ = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = self.start_load_phase()
    # Wait till load phase is over
    self.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = self.cluster_util.find_orchestrator(master)
        self.assertTrue(status,
                        msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = self.start_access_phase()
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.sleep(10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")
    bucket = self.bucket_util.buckets[0]
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        pid = shell.get_memcache_pid()
        shell.disconnect()
    else:
        times = 2
        if self.cluster_run:
            times = 20
        for _ in xrange(times):
            try:
                shell = RemoteMachineShellConnection(server)
                pid = shell.get_memcache_pid()
                shell.disconnect()
                break
            except EOFError as e:
                self.log.error("{0}. Retry in 2 sec".format(e))
                self.sleep(2)
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    self.sleep(10)
    # we can't get stats for the new node when rebalance fails
    if not self.swap_orchestrator:
        self.bucket_util._wait_warmup_completed([master], bucket, wait_time=600)
    # we expect that the rebalance will fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        self.sleep(30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(),
                        msg="Rebalance failed after adding node {0}".format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    self.verification_phase()
def test_incremental_rebalance_out_continuous_bidirectional_sets_deletes(self):
    cluster_ref_a = "cluster_ref_a"
    master_a = self._input.clusters.get(0)[0]
    rest_conn_a = RestConnection(master_a)
    cluster_ref_b = "cluster_ref_b"
    master_b = self._input.clusters.get(1)[0]
    rest_conn_b = RestConnection(master_b)

    # Setup bi-directional continuous replication
    replication_type = "continuous"
    rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                   master_b.rest_username,
                                   master_b.rest_password, cluster_ref_b)
    rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
                                   master_a.rest_username,
                                   master_a.rest_password, cluster_ref_a)
    (rep_database_a, rep_id_a) = rest_conn_a.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_b)
    (rep_database_b, rep_id_b) = rest_conn_b.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_a)
    self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
    self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))

    load_thread_list = []
    # Start load
    kvstore = ClientKeyValueStore()
    self._params["ops"] = "set"
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread.start()
    load_thread.join()

    # Do some deletes
    self._params["ops"] = "delete"
    self._params["count"] = self._num_items / 5
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread_list.append(load_thread)

    # Start all loads concurrently
    for lt in load_thread_list:
        lt.start()

    # Trigger rebalance on both source and destination clusters
    servers_a = self._input.clusters.get(0)
    servers_b = self._input.clusters.get(1)
    rebalanced_servers_a = []
    rebalanced_servers_b = []
    which_servers_a = []
    which_servers_b = []

    # Rebalance all the nodes together
    RebalanceHelper.rebalance_in(servers_a, len(servers_a) - 1)
    RebalanceHelper.rebalance_in(servers_b, len(servers_b) - 1)
    rebalanced_servers_a.extend(servers_a)
    rebalanced_servers_b.extend(servers_b)
    nodes_a = rest_conn_a.node_statuses()
    nodes_b = rest_conn_b.node_statuses()

    # Incremental rebalance out one node in cluster_a, then cluster_b
    while len(nodes_a) > 1:
        toBeEjectedNode = RebalanceHelper.pick_node(master_a)
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))
        self.log.info("removing node {0} and rebalance afterwards".format(toBeEjectedNode.id))
        rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],
                              ejectedNodes=[toBeEjectedNode.id])
        self.assertTrue(rest_conn_a.monitorRebalance(),
                        msg="rebalance operation failed after removing node {0}".format(
                            toBeEjectedNode.id))
        while len(nodes_b) > 1:
            toBeEjectedNode = RebalanceHelper.pick_node(master_b)
            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master_b)))
            self.log.info("removing node {0} and rebalance afterwards".format(toBeEjectedNode.id))
            rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],
                                  ejectedNodes=[toBeEjectedNode.id])
            self.assertTrue(rest_conn_b.monitorRebalance(),
                            msg="rebalance operation failed after removing node {0}".format(
                                toBeEjectedNode.id))
            break
        for node in nodes_b:
            for rebalanced_server in rebalanced_servers_b:
                if rebalanced_server.ip.find(node.ip) != -1:
                    rebalanced_servers_b.remove(rebalanced_server)
                    break
        nodes_b = rest_conn_b.node_statuses()
        for node in nodes_a:
            for rebalanced_server in rebalanced_servers_a:
                if rebalanced_server.ip.find(node.ip) != -1:
                    rebalanced_servers_a.remove(rebalanced_server)
                    break
        nodes_a = rest_conn_a.node_statuses()
    for node in nodes_a:
        for rebalanced_server in rebalanced_servers_a:
            if rebalanced_server.ip.find(node.ip) != -1:
                rebalanced_servers_a.remove(rebalanced_server)
                break
    nodes_a = rest_conn_a.node_statuses()

    # Wait for loading threads to finish
    for lt in load_thread_list:
        lt.join()
    self.log.info("All loading threads finished")

    # Verify replication
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b, self._buckets[0], kvstore,
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated data failed")
    self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a, rest_conn_b, self._buckets[0],
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated revisions failed")
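# The nested node-pruning bookkeeping above repeats for both clusters. A
# sketch of it as one helper (hypothetical): drop every tracked server whose
# ip matches a node still reported by node_statuses().
def _prune_rebalanced_servers(rebalanced_servers, nodes):
    for node in nodes:
        for rebalanced_server in rebalanced_servers:
            if rebalanced_server.ip.find(node.ip) != -1:
                rebalanced_servers.remove(rebalanced_server)
                break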
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        command = "[erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)]."
        memcached_restarted = rest.diag_eval(command)
        self.assertTrue(memcached_restarted,
                        "unable to restart memcached/moxi process through diag/eval")
        time.sleep(20)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes {0}".format(toBeEjectedNodes))
    self.log.info("DONE DATA ACCESS PHASE")
    #for bucket in rest.get_buckets():
    #    SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                  bucket_data[bucket.name].get('inserted_keys'),
    #                                  bucket.name, self)
    #    RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
def test_failover_continuous_bidirectional_sets_deletes(self):
    cluster_ref_a = "cluster_ref_a"
    master_a = self._input.clusters.get(0)[0]
    rest_conn_a = RestConnection(master_a)
    cluster_ref_b = "cluster_ref_b"
    master_b = self._input.clusters.get(1)[0]
    rest_conn_b = RestConnection(master_b)

    # Rebalance all the nodes together
    servers_a = self._input.clusters.get(0)
    servers_b = self._input.clusters.get(1)
    rebalanced_servers_a = []
    rebalanced_servers_b = []
    RebalanceHelper.rebalance_in(servers_a, len(servers_a) - 1)
    RebalanceHelper.rebalance_in(servers_b, len(servers_b) - 1)
    rebalanced_servers_a.extend(servers_a)
    rebalanced_servers_b.extend(servers_b)

    # Setup bi-directional continuous replication
    replication_type = "continuous"
    rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                   master_b.rest_username,
                                   master_b.rest_password, cluster_ref_b)
    rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
                                   master_a.rest_username,
                                   master_a.rest_password, cluster_ref_a)
    (rep_database_a, rep_id_a) = rest_conn_a.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_b)
    (rep_database_b, rep_id_b) = rest_conn_b.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_a)

    load_thread_list = []
    # Start load
    kvstore = ClientKeyValueStore()
    self._params["ops"] = "set"
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread.start()
    load_thread.join()
    RebalanceHelper.wait_for_persistence(master_a, self._buckets[0])

    # Do some deletes
    self._params["ops"] = "delete"
    self._params["count"] = self._num_items / 5
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread_list.append(load_thread)

    # Start all loads concurrently
    for lt in load_thread_list:
        lt.start()

    # Do the failover of nodes on both clusters
    self.log.info("Failing over nodes")
    self.log.info("current nodes on cluster 1: {0}".format(RebalanceHelper.getOtpNodeIds(master_a)))
    self.log.info("current nodes on cluster 2: {0}".format(RebalanceHelper.getOtpNodeIds(master_b)))

    # Find nodes to be failed over
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_a, howmany=self._failover_factor)
    optNodesIds_a = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_a:
        status, content = ClusterOperationHelper.find_orchestrator(master_a)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds_a[0] = content
        master_a = self._input.clusters.get(0)[-1]
        rest_conn_a = RestConnection(master_a)

    # Failover selected nodes
    for node in optNodesIds_a:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_a.fail_over(node)

    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_b, howmany=self._failover_factor)
    optNodesIds_b = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_b:
        status, content = ClusterOperationHelper.find_orchestrator(master_b)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        optNodesIds_b[0] = content
        master_b = self._input.clusters.get(1)[-1]
        rest_conn_b = RestConnection(master_b)

    self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
    self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))

    # Failover selected nodes
    for node in optNodesIds_b:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_b.fail_over(node)

    rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],
                          ejectedNodes=optNodesIds_a)
    rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],
                          ejectedNodes=optNodesIds_b)
    self.assertTrue(rest_conn_a.monitorRebalance(),
                    msg="rebalance operation failed after removing nodes on cluster 1")
    self.assertTrue(rest_conn_b.monitorRebalance(),
                    msg="rebalance operation failed after removing nodes on cluster 2")

    # Wait for loading threads to finish
    for lt in load_thread_list:
        lt.join()
    self.log.info("All loading threads finished")

    # Verify replication
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b, self._buckets[0], kvstore,
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated data failed")
    self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a, rest_conn_b, self._buckets[0],
                                                        self._poll_sleep, self._poll_timeout),
                    "Verification of replicated revisions failed")
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        self.log.info("STOP/START SWAP REBALANCE PHASE")
        retry = 0
        for expected_progress in (20, 40, 60):
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    time.sleep(20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    time.sleep(1)
            #self.assertTrue(reached, "rebalance failed or did not reach {0}%".format(expected_progress))
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}".format(optNodesIds))
    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.log.info("DONE DATA ACCESS PHASE")
    #for bucket in rest.get_buckets():
    #    SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                  bucket_data[bucket.name].get('inserted_keys'),
    #                                  bucket.name, self)
    #    RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
def _common_test_body(self):
    master = self.servers[0]
    rest = RestConnection(master)

    # start load; max_ops_per_second is the combined limit for all buckets
    buckets = rest.get_buckets()
    loaders = []
    self.log.info("max-ops-per-second per bucket: {0}".format(self.max_ops_per_second / len(buckets)))
    for bucket in buckets:
        loader = {}
        loader["mcsoda"] = LoadWithMcsoda(master, self.keys_count, prefix='',
                                          bucket=bucket.name,
                                          password=bucket.saslPassword,
                                          protocol='membase-binary')
        loader["mcsoda"].cfg["max-ops"] = 0
        loader["mcsoda"].cfg["max-ops-per-sec"] = self.max_ops_per_second / len(buckets)
        loader["mcsoda"].cfg["exit-after-creates"] = 0
        loader["mcsoda"].cfg["min-value-size"] = self.min_item_size
        loader["mcsoda"].cfg["json"] = 0
        loader["mcsoda"].cfg["batch"] = 100
        loader["thread"] = Thread(target=loader["mcsoda"].load_data,
                                  name='mcloader_' + bucket.name)
        loader["thread"].daemon = True
        loaders.append(loader)

    for loader in loaders:
        loader["thread"].start()

    for iteration in range(self.repeat):
        for server in self.servers[1:]:
            self.log.info("iteration {0}: ".format(iteration))
            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
            self.log.info("adding node {0} and rebalance afterwards".format(server.ip))
            rebalance_done = False
            rebalance_try = 0
            while not rebalance_done:
                try:
                    ClusterOperationHelper.begin_rebalance_in(master, [server])
                    ClusterOperationHelper.end_rebalance(master)
                    rebalance_done = True
                except AssertionError as e:
                    rebalance_try += 1
                    self.log.error(e)
                    time.sleep(5)
                    if rebalance_try > 5:
                        raise e
        for server in self.servers[1:]:
            self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
            self.log.info("removing node {0} and rebalance afterwards".format(server.ip))
            rebalance_done = False
            rebalance_try = 0
            while not rebalance_done:
                try:
                    ClusterOperationHelper.begin_rebalance_out(master, [server])
                    ClusterOperationHelper.end_rebalance(master)
                    rebalance_done = True
                except AssertionError as e:
                    rebalance_try += 1
                    self.log.error(e)
                    time.sleep(5)
                    if rebalance_try > 5:
                        raise e

    # stop load
    for loader in loaders:
        loader["mcsoda"].load_stop()
    for loader in loaders:
        loader["thread"].join()
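# The rebalance-with-retries loop above appears twice (once for rebalance in,
# once for rebalance out). A sketch of the shared form -- a hypothetical
# helper; `begin_fn` would be ClusterOperationHelper.begin_rebalance_in or
# begin_rebalance_out, both used above.
def _rebalance_with_retries(log, master, server, begin_fn, max_tries=5):
    last_error = None
    for _ in range(max_tries + 1):
        try:
            begin_fn(master, [server])
            ClusterOperationHelper.end_rebalance(master)
            return
        except AssertionError as e:
            # Rebalance failed; log and retry after a short pause
            last_error = e
            log.error(e)
            time.sleep(5)
    raise last_error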
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                        format(status, content))
        # When swapping all the nodes, the orchestrator is ejected as well
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    # The rebalance is made to fail at 20%, 40% and 60% completion by
    # killing memcached on the master node, then restarted
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = StatsCommon.get_stats([master], bucket, "", "pid")[master]
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
        BaseTestCase._wait_warmup_completed(self, [master], bucket, wait_time=600)
        time.sleep(5)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed with ejected nodes {0}".format(toBeEjectedNodes))

    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.log.info("DONE DATA ACCESS PHASE")

    #for bucket in rest.get_buckets():
    #    SwapRebalanceBase.verify_data(new_swap_servers[0],
    #        bucket_data[bucket.name].get('inserted_keys'), bucket.name, self)
    #    RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
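# The kill step above (look up the memcached PID from the bucket stats, then
# run kill -9 inside the node's Erlang VM via diag/eval) is the
# crash-injection primitive of this test. A minimal sketch of it as a helper,
# assuming the RestConnection and StatsCommon APIs used above; the name
# _kill_memcached is hypothetical.
def _kill_memcached(rest, server, bucket):
    # memcached reports its own PID in the bucket stats
    pid = StatsCommon.get_stats([server], bucket, "", "pid")[server]
    # os:cmd/1 is evaluated by ns_server on the node itself
    command = "os:cmd(\"kill -9 {0}\")".format(pid)
    return rest.diag_eval(command)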
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
                        format(status, content))
        # When swapping all the nodes, the orchestrator is ejected as well
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    # The rebalance is made to fail at 20%, 40% and 60% completion by
    # killing memcached on the master node, then restarted
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for _ in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}. Retry in 2 sec".format(e))
                    time.sleep(2)
        if pid is None:
            self.fail("unable to get the PID of memcached")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}?? {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        time.sleep(10)

        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)

        # we expect the rebalance to fail; poll its status for up to 60s
        count = 0
        while rest._rebalance_progress_status() == "running" and count < 60:
            self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
            time.sleep(1)
            count += 1
        self.log.info("rebalance progress status: {0}".format(rest._rebalance_progress_status()))

        knownNodes = rest.node_statuses()
        self.log.info("nodes still in the cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
        # only eject nodes that are still known to the cluster
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed with ejected nodes {0}".format(toBeEjectedNodes))
    SwapRebalanceBase.verification_phase(self, master)