class IngestionInterrupt_CBAS(CBASBaseTest): def setUp(self): self.input = TestInputSingleton.input self.input.test_params.update({"default_bucket": False}) super(IngestionInterrupt_CBAS, self).setUp() if "add_all_cbas_nodes" in self.input.test_params \ and self.input.test_params["add_all_cbas_nodes"] \ and len(self.cluster.cbas_nodes) > 0: self.otpNodes.extend( self.cluster_util.add_all_nodes_then_rebalance( self.cluster, self.cluster.cbas_nodes)) self.bucket_util.create_default_bucket(self.cluster, storage=self.bucket_storage) self.cb_bucket_name = self.input.param('cb_bucket_name', 'default') self.cbas_util.createConn("default") def setup_for_test(self, skip_data_loading=False): if not skip_data_loading: # Load Couchbase bucket first. self.perform_doc_ops_in_all_cb_buckets("create", 0, self.num_items, batch_size=1000) self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items) # Create dataset on the CBAS bucket self.cbas_util.create_dataset_on_bucket( cbas_bucket_name=self.cb_bucket_name, cbas_dataset_name=self.cbas_dataset_name) # Create indexes on the CBAS bucket self.create_secondary_indexes = self.input.param( "create_secondary_indexes", False) if self.create_secondary_indexes: self.index_fields = "profession:string,number:bigint" create_idx_statement = "create index {0} on {1}({2});".format( self.index_name, self.cbas_dataset_name, self.index_fields) status, metrics, errors, results, _ = self.cbas_util.execute_statement_on_cbas_util( create_idx_statement) self.assertTrue(status == "success", "Create Index query failed") self.assertTrue( self.cbas_util.verify_index_created( self.index_name, self.index_fields.split(","), self.cbas_dataset_name)[0]) # Connect to Bucket self.cbas_util.connect_to_bucket( cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) if not skip_data_loading: # Validate no. of items in CBAS dataset if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) def ingestion_in_progress(self): self.cbas_util.disconnect_from_bucket(self.cbas_bucket_name) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 3, batch_size=1000) self.cbas_util.connect_to_bucket( cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) def test_service_restart(self): self.setup_for_test() self.restart_method = self.input.param('restart_method', None) self.cbas_node_type = self.input.param('cbas_node_type', None) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() if self.cbas_node_type == "CC": node_in_test = self.cbas_node else: node_in_test = self.cluster.cbas_nodes[0] items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) self.log.info("Items before service restart: %s" % items_in_cbas_bucket) if self.restart_method == "graceful": self.log.info("Gracefully re-starting service on node %s" % node_in_test) NodeHelper.do_a_warm_up(node_in_test) NodeHelper.wait_service_started(node_in_test) else: self.log.info("Kill Memcached process on node %s" % node_in_test) shell = RemoteMachineShellConnection(node_in_test) shell.kill_memcached() items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.log.info( "After graceful service restart docs in CBAS bucket : %s" % items_in_cbas_bucket) if items_in_cbas_bucket < self.num_items * 3 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did interrupted and restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) if self.cbas_node_type == "NC": self.assertTrue(fail_count + aborted_count == 0, "Some queries failed/aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 3): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) def test_kill_analytics_service(self): self.setup_for_test() process_name = self.input.param('process_name', None) service_name = self.input.param('service_name', None) cbas_node_type = self.input.param('cbas_node_type', None) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() if cbas_node_type == "CC": node_in_test = self.cbas_node else: node_in_test = self.cluster.cbas_nodes[0] items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) self.log.info("Items before service kill: %s" % items_in_cbas_bucket) self.log.info("Kill %s process on node %s" % (process_name, node_in_test)) shell = RemoteMachineShellConnection(node_in_test) shell.kill_process(process_name, service_name) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass # start_time = time.time() # while items_in_cbas_bucket <=0 and time.time()<start_time+120: # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.sleep(1) # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) self.log.info("After %s kill, docs in CBAS bucket : %s" % (process_name, items_in_cbas_bucket)) if items_in_cbas_bucket < self.num_items * 3 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did not interrupted but restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) if cbas_node_type == "NC": self.assertTrue((fail_count + aborted_count) == 0, "Some queries failed/aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 3): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) def test_stop_start_service_ingest_data(self): self.setup_for_test() self.cbas_node_type = self.input.param('cbas_node_type', None) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() if self.cbas_node_type == "CC": node_in_test = self.cbas_node self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) self.cbas_util.createConn("default") else: node_in_test = self.cluster.cbas_nodes[0] items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) self.log.info("Items before service restart: %s" % items_in_cbas_bucket) self.log.info("Gracefully stopping service on node %s" % node_in_test) NodeHelper.stop_couchbase(node_in_test) NodeHelper.start_couchbase(node_in_test) NodeHelper.wait_service_started(node_in_test) # self.sleep(10, "wait for service to come up.") # # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.log.info("After graceful STOPPING/STARTING service docs in CBAS bucket : %s"%items_in_cbas_bucket) # # start_time = time.time() # while items_in_cbas_bucket <=0 and time.time()<start_time+60: # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.sleep(1) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass if items_in_cbas_bucket < self.num_items * 3 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did not interrupted but restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) if self.cbas_node_type == "NC": self.assertTrue(fail_count + aborted_count == 0, "Some queries failed/aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 3): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) def test_disk_full_ingest_data(self): self.cbas_node_type = self.input.param('cbas_node_type', None) if self.cbas_node_type == "CC": node_in_test = self.cbas_node self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) else: node_in_test = self.cluster.cbas_nodes[0] remote_client = RemoteMachineShellConnection(node_in_test) output, error = remote_client.execute_command("rm -rf full_disk*", use_channel=True) remote_client.log_command_output(output, error) self.setup_for_test() query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) def _get_disk_usage_in_MB(remote_client): disk_info = remote_client.get_disk_info(in_MB=True) disk_space = disk_info[1].split()[-3][:-1] return disk_space du = int(_get_disk_usage_in_MB(remote_client)) - 50 chunk_size = 1024 while int(du) > 0: output, error = remote_client.execute_command( "dd if=/dev/zero of=full_disk{0} bs={1}M count=1".format( str(du) + "_MB" + str(time.time()), chunk_size), use_channel=True) remote_client.log_command_output(output, error) du -= 1024 if du < 1024: chunk_size = du self.ingestion_in_progress() items_in_cbas_bucket_before, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) try: while items_in_cbas_bucket_before != items_in_cbas_bucket_after: items_in_cbas_bucket_before, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) self.sleep(2) items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: self.log.info("Ingestion interrupted and server seems to be down") if items_in_cbas_bucket_before == self.num_items * 3: self.log.info("Data Ingestion did not interrupted but completed.") elif items_in_cbas_bucket_before < self.num_items * 3: self.log.info("Data Ingestion Interrupted successfully") output, error = remote_client.execute_command("rm -rf full_disk*", use_channel=True) remote_client.log_command_output(output, error) remote_client.disconnect() self.sleep( 10, "wait for service to come up after disk space is made available.") run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." 
% aborted_count) if self.cbas_node_type == "NC": self.assertTrue(fail_count + aborted_count == 0, "Some queries failed/aborted") self.sleep(60) query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 3): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) def test_stop_network_ingest_data(self): self.setup_for_test() self.cbas_node_type = self.input.param('cbas_node_type', None) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() # Add the code for stop network here: if self.cbas_node_type: if self.cbas_node_type == "CC": node_in_test = self.cbas_node self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) self.cbas_util.createConn("default") else: node_in_test = self.cluster.cbas_nodes[0] # Stop network on KV node to mimic n/w partition on KV else: node_in_test = self.cluster.master items_in_cbas_bucket_before, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) self.log.info("Intems before network down: %s" % items_in_cbas_bucket_before) RemoteMachineShellConnection(node_in_test).stop_network("30") # self.sleep(40, "Wait for network to come up.") items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass # items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) self.log.info("Items after network is up: %s" % items_in_cbas_bucket) # start_time = time.time() # while items_in_cbas_bucket_after <=0 and time.time()<start_time+60: # items_in_cbas_bucket_after, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.sleep(1) # items_in_cbas_bucket = items_in_cbas_bucket_after if items_in_cbas_bucket < self.num_items * 3 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did not interrupted but restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." 
% aborted_count) if self.cbas_node_type == "NC": self.assertTrue(fail_count + aborted_count == 0, "Some queries failed/aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 3): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) def test_network_hardening(self): self.setup_for_test() end = self.num_items CC = self.cbas_node NC = self.cluster.cbas_nodes KV = self.cluster.master nodes = [CC] + NC for node in nodes: for i in xrange(2): NodeHelper.enable_firewall(node) start = end end = start + self.num_items tasks = self.perform_doc_ops_in_all_cb_buckets("create", start, end, batch_size=1000, _async=True) self.sleep( 30, "Sleep after enabling firewall on node %s then disbale it." % node.ip) NodeHelper.disable_firewall(node) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.log.info("Items after network is up: %s" % items_in_cbas_bucket) if items_in_cbas_bucket < end and items_in_cbas_bucket > start: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < start: self.log.info( "Data Ingestion did not interrupted but restarting from 0." ) else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) query = "select count(*) from {0};".format( self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) for task in tasks: self.task_manager.get_task_result(task) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, end): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) NodeHelper.enable_firewall(node, bidirectional=True) start = end end = start + self.num_items tasks = self.perform_doc_ops_in_all_cb_buckets("create", start, end, batch_size=1000, _async=True) self.sleep( 30, "Sleep after enabling firewall on CC node then disbale it." ) NodeHelper.disable_firewall(node) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.log.info("Items after network is up: %s" % items_in_cbas_bucket) if items_in_cbas_bucket < end and items_in_cbas_bucket > start: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < start: self.log.info( "Data Ingestion did not interrupted but restarting from 0." ) else: self.log.info( "Data Ingestion did not interrupted but complete before service restart." ) query = "select count(*) from {0};".format( self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) for task in tasks: self.task_manager.get_task_result(task) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, end): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" )
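# A minimal refactoring sketch (not part of the original suite): the polling loop,
# the ingestion-state classification and the handle-status tally are repeated
# verbatim in every test of IngestionInterrupt_CBAS above. The helper names here
# (_wait_for_cbas_doc_count, _classify_ingestion_state, _tally_handle_statuses)
# are hypothetical; the cbas_util calls are the same ones the tests already use.
import time


def _wait_for_cbas_doc_count(cbas_util, dataset_name, timeout=60):
    """Poll the dataset item count until it is a positive number or timeout."""
    items = 0
    start_time = time.time()
    while items in (0, -1) and time.time() < start_time + timeout:
        try:
            items, _ = cbas_util.get_num_items_in_cbas_dataset(dataset_name)
        except Exception:
            # The analytics service may still be coming up; keep polling.
            pass
    return items


def _classify_ingestion_state(items, lower_bound, upper_bound):
    """Return a human-readable description of where ingestion stopped."""
    if lower_bound < items < upper_bound:
        return "Data ingestion was interrupted and resumed mid-way"
    if items < lower_bound:
        return "Data ingestion restarted from 0"
    return "Data ingestion completed before the interruption"


def _tally_handle_statuses(cbas_util, node, handles, shell):
    """Count queued analytics jobs by status after the interruption."""
    counts = {"running": 0, "failed": 0, "success": 0, "aborted": 0}
    for handle in handles:
        status, _ = cbas_util.retrieve_request_status_using_handle(
            node, handle, shell)
        counts[status if status in counts else "aborted"] += 1
    return counts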
class UpgradeTests(UpgradeBase): def setUp(self): super(UpgradeTests, self).setUp() self.cbas_util = CbasUtil(self.task) self.cbas_spec_name = self.input.param("cbas_spec", "local_datasets") self.rebalance_util = CBASRebalanceUtil(self.cluster_util, self.bucket_util, self.task, vbucket_check=True, cbas_util=self.cbas_util) if self.input.param("n2n_encryption", False): CbServer.use_https = True trust_all_certs() self.security_util = SecurityUtils(self.log) rest = RestConnection(self.cluster.master) self.log.info("Disabling Auto-Failover") if not rest.update_autofailover_settings(False, 120): self.fail("Disabling Auto-Failover failed") self.log.info("Setting node to node encryption level to all") self.security_util.set_n2n_encryption_level_on_nodes( self.cluster.nodes_in_cluster, level="all") CbServer.use_https = True self.log.info("Enabling Auto-Failover") if not rest.update_autofailover_settings(True, 300): self.fail("Enabling Auto-Failover failed") cbas_cc_node_ip = None retry = 0 self.cluster.cbas_nodes = \ self.cluster_util.get_nodes_from_services_map( self.cluster, service_type="cbas", get_all_nodes=True, servers=self.cluster.nodes_in_cluster) while True and retry < 60: cbas_cc_node_ip = self.cbas_util.retrieve_cc_ip_from_master( self.cluster) if cbas_cc_node_ip: break else: self.sleep(10, "Waiting for CBAS service to come up") retry += 1 if not cbas_cc_node_ip: self.fail("CBAS service did not come up even after 10 " "mins.") for server in self.cluster.cbas_nodes: if server.ip == cbas_cc_node_ip: self.cluster.cbas_cc_node = server break if not self.cbas_util.wait_for_cbas_to_recover(self.cluster, timeout=300): self.fail("Analytics service failed to start post adding cbas " "nodes to cluster") self.pre_upgrade_setup() self.log_setup_status(self.__class__.__name__, "Finished", stage=self.setUp.__name__) def tearDown(self): self.log_setup_status(self.__class__.__name__, "Started", stage=self.tearDown.__name__) self.cluster.master = self.cluster_util.get_kv_nodes(self.cluster)[0] self.cluster_util.cluster_cleanup(self.cluster, self.bucket_util) super(UpgradeTests, self).tearDown() self.log_setup_status(self.__class__.__name__, "Finished", stage=self.tearDown.__name__) def pre_upgrade_setup(self): """ Number of datasets is fixed here, as pre 6.6 default max number of datasets that can be created was 8. 
""" major_version = float(self.initial_version[:3]) if major_version >= 7.0: update_spec = { "no_of_dataverses": self.input.param('pre_update_no_of_dv', 2), "no_of_datasets_per_dataverse": self.input.param('pre_update_ds_per_dv', 4), "no_of_synonyms": self.input.param('pre_update_no_of_synonym', 0), "no_of_indexes": self.input.param('pre_update_no_of_index', 3), "max_thread_count": self.input.param('no_of_threads', 10), } else: update_spec = { "no_of_dataverses": self.input.param('pre_update_no_of_dv', 2), "no_of_datasets_per_dataverse": self.input.param('pre_update_ds_per_dv', 4), "no_of_synonyms": 0, "no_of_indexes": self.input.param('pre_update_no_of_index', 3), "max_thread_count": self.input.param('no_of_threads', 10), "dataverse": { "cardinality": 1, "creation_method": "dataverse" }, "dataset": { "creation_methods": ["cbas_dataset"], "bucket_cardinality": 1 }, "index": { "creation_method": "index" } } if update_spec["no_of_dataverses"] * update_spec[ "no_of_datasets_per_dataverse"] > 8: self.fail("Total number of datasets across all dataverses " "cannot be more than 8 for pre 7.0 builds") if not self.cbas_setup(update_spec): self.fail("Pre Upgrade CBAS setup failed") if major_version >= 7.1: self.replica_num = self.input.param('replica_num', 0) set_result = self.cbas_util.set_replica_number_from_settings( self.cluster.master, replica_num=self.replica_num) if set_result != self.replica_num: self.fail("Error while setting replica for CBAS") self.log.info( "Rebalancing for CBAS replica setting change to take " "effect.") rebalance_task, _ = self.rebalance_util.rebalance( self.cluster, kv_nodes_in=0, kv_nodes_out=0, cbas_nodes_in=0, cbas_nodes_out=0, available_servers=[], exclude_nodes=[]) if not self.rebalance_util.wait_for_rebalance_task_to_complete( rebalance_task, self.cluster): self.fail("Rebalance failed") def cbas_setup(self, update_spec, connect_local_link=True): if self.cbas_spec_name: self.cbas_spec = self.cbas_util.get_cbas_spec(self.cbas_spec_name) self.cbas_util.update_cbas_spec(self.cbas_spec, update_spec) cbas_infra_result = self.cbas_util.create_cbas_infra_from_spec( self.cluster, self.cbas_spec, self.bucket_util, wait_for_ingestion=False) if not cbas_infra_result[0]: self.log.error( "Error while creating infra from CBAS spec -- {0}".format( cbas_infra_result[1])) return False if connect_local_link: for dataverse in self.cbas_util.dataverses: if not self.cbas_util.connect_link( self.cluster, ".".join([dataverse, "Local"])): self.log.error( "Failed to connect Local link for dataverse - {0}". 
format(dataverse)) return False if not self.cbas_util.wait_for_ingestion_all_datasets( self.cluster, self.bucket_util): self.log.error("Data ingestion did not happen in the datasets") return False return True def post_upgrade_validation(self): major_version = float(self.upgrade_version[:3]) # rebalance once again to activate CBAS service self.sleep(180, "Sleep before rebalancing to activate CBAS service") rebalance_task, _ = self.rebalance_util.rebalance(self.cluster, kv_nodes_in=0, kv_nodes_out=0, cbas_nodes_in=0, cbas_nodes_out=0, available_servers=[], exclude_nodes=[]) if not self.rebalance_util.wait_for_rebalance_task_to_complete( rebalance_task, self.cluster): self.log_failure("Rebalance failed") return False rest = RestConnection(self.cluster.master) # Update RAM quota allocated to buckets created before upgrade cluster_info = rest.get_nodes_self() kv_quota = \ cluster_info.__getattribute__(CbServer.Settings.KV_MEM_QUOTA) bucket_size = kv_quota // (self.input.param("num_buckets", 1) + 1) for bucket in self.cluster.buckets: self.bucket_util.update_bucket_property(self.cluster.master, bucket, bucket_size) validation_results = {} self.log.info("Validating pre upgrade cbas infra") results = list() for dataverse in self.cbas_util.dataverses: results.append( self.cbas_util.validate_dataverse_in_metadata( self.cluster, dataverse)) for dataset in self.cbas_util.list_all_dataset_objs( dataset_source="internal"): results.append( self.cbas_util.validate_dataset_in_metadata( self.cluster, dataset_name=dataset.name, dataverse_name=dataset.dataverse_name)) results.append( self.cbas_util.validate_cbas_dataset_items_count( self.cluster, dataset_name=dataset.full_name, expected_count=dataset.num_of_items)) for index in self.cbas_util.list_all_index_objs(): result, _ = self.cbas_util.verify_index_created( self.cluster, index_name=index.name, dataset_name=index.dataset_name, indexed_fields=index.indexed_fields) results.append(result) results.append( self.cbas_util.verify_index_used( self.cluster, statement="SELECT VALUE v FROM {0} v WHERE age > 2".format( index.full_dataset_name), index_used=True, index_name=None)) validation_results["pre_upgrade"] = all(results) if major_version >= 7.1: self.log.info("Enabling replica for analytics") self.replica_num = self.input.param('replica_num', 0) set_result = self.cbas_util.set_replica_number_from_settings( self.cluster.master, replica_num=self.replica_num) if set_result != self.replica_num: self.fail("Error while setting replica for CBAS") self.log.info( "Rebalancing for CBAS replica setting change to take " "effect.") rebalance_task, _ = self.rebalance_util.rebalance( self.cluster, kv_nodes_in=0, kv_nodes_out=0, cbas_nodes_in=0, cbas_nodes_out=0, available_servers=[], exclude_nodes=[]) if not self.rebalance_util.wait_for_rebalance_task_to_complete( rebalance_task, self.cluster): self.fail("Rebalance failed") if not self.cbas_util.wait_for_replication_to_finish(self.cluster): self.fail("Replication could not complete before timeout") if not self.cbas_util.verify_actual_number_of_replicas( self.cluster, len(self.cluster.cbas_nodes) - 1): self.fail("Actual number of replicas is different from what " "was set") self.log.info("Loading docs in default collection of existing buckets") for bucket in self.cluster.buckets: gen_load = doc_generator(self.key, self.num_items, self.num_items * 2, randomize_doc_size=True, randomize_value=True, randomize=True) async_load_task = self.task.async_load_gen_docs( self.cluster, bucket, gen_load, DocLoading.Bucket.DocOps.CREATE, 
active_resident_threshold=self.active_resident_threshold, timeout_secs=self.sdk_timeout, process_concurrency=8, batch_size=500, sdk_client_pool=self.sdk_client_pool) self.task_manager.get_task_result(async_load_task) # Update num_items in case of DGM run if self.active_resident_threshold != 100: self.num_items = async_load_task.doc_index bucket.scopes[CbServer.default_scope].collections[ CbServer.default_collection].num_items = self.num_items * 2 # Verify doc load count self.bucket_util._wait_for_stats_all_buckets( self.cluster, self.cluster.buckets) self.sleep(30, "Wait for num_items to get reflected") current_items = self.bucket_util.get_bucket_current_item_count( self.cluster, bucket) if current_items == self.num_items * 2: validation_results["post_upgrade_data_load"] = True else: self.log.error( "Mismatch in doc_count. Actual: %s, Expected: %s" % (current_items, self.num_items * 2)) validation_results["post_upgrade_data_load"] = False self.bucket_util.print_bucket_stats(self.cluster) if not self.cbas_util.wait_for_ingestion_all_datasets( self.cluster, self.bucket_util): validation_results["post_upgrade_data_load"] = False self.log.error("Data ingestion did not happen in the datasets") else: validation_results["post_upgrade_data_load"] = True self.log.info( "Deleting all the data from default collection of buckets created before upgrade" ) for bucket in self.cluster.buckets: gen_load = doc_generator(self.key, 0, self.num_items * 2, randomize_doc_size=True, randomize_value=True, randomize=True) async_load_task = self.task.async_load_gen_docs( self.cluster, bucket, gen_load, DocLoading.Bucket.DocOps.DELETE, active_resident_threshold=self.active_resident_threshold, timeout_secs=self.sdk_timeout, process_concurrency=8, batch_size=500, sdk_client_pool=self.sdk_client_pool) self.task_manager.get_task_result(async_load_task) # Verify doc load count self.bucket_util._wait_for_stats_all_buckets( self.cluster, self.cluster.buckets) while True: current_items = self.bucket_util.get_bucket_current_item_count( self.cluster, bucket) if current_items == 0: break else: self.sleep(30, "Wait for num_items to get reflected") bucket.scopes[CbServer.default_scope].collections[ CbServer.default_collection].num_items = 0 if major_version >= 7.0: self.log.info("Creating scopes and collections in existing bucket") scope_spec = {"name": self.cbas_util.generate_name()} self.bucket_util.create_scope_object(self.cluster.buckets[0], scope_spec) collection_spec = { "name": self.cbas_util.generate_name(), "num_items": self.num_items } self.bucket_util.create_collection_object(self.cluster.buckets[0], scope_spec["name"], collection_spec) bucket_helper = BucketHelper(self.cluster.master) status, content = bucket_helper.create_scope( self.cluster.buckets[0].name, scope_spec["name"]) if status is False: self.fail("Create scope failed for %s:%s, Reason - %s" % (self.cluster.buckets[0].name, scope_spec["name"], content)) self.bucket.stats.increment_manifest_uid() status, content = bucket_helper.create_collection( self.cluster.buckets[0].name, scope_spec["name"], collection_spec) if status is False: self.fail( "Create collection failed for %s:%s:%s, Reason - %s" % (self.cluster.buckets[0].name, scope_spec["name"], collection_spec["name"], content)) self.bucket.stats.increment_manifest_uid() self.log.info("Creating new buckets with scopes and collections") for i in range(1, self.input.param("num_buckets", 1) + 1): self.bucket_util.create_default_bucket( self.cluster, replica=self.num_replicas, 
compression_mode=self.compression_mode, ram_quota=bucket_size, bucket_type=self.bucket_type, storage=self.bucket_storage, eviction_policy=self.bucket_eviction_policy, bucket_durability=self.bucket_durability_level, bucket_name="bucket_{0}".format(i)) if major_version >= 7.0: self.over_ride_spec_params = self.input.param( "override_spec_params", "").split(";") self.load_data_into_buckets() else: for bucket in self.cluster.buckets[1:]: gen_load = doc_generator(self.key, 0, self.num_items, randomize_doc_size=True, randomize_value=True, randomize=True) async_load_task = self.task.async_load_gen_docs( self.cluster, bucket, gen_load, DocLoading.Bucket.DocOps.CREATE, active_resident_threshold=self.active_resident_threshold, timeout_secs=self.sdk_timeout, process_concurrency=8, batch_size=500, sdk_client_pool=self.sdk_client_pool) self.task_manager.get_task_result(async_load_task) # Update num_items in case of DGM run if self.active_resident_threshold != 100: self.num_items = async_load_task.doc_index bucket.scopes[CbServer.default_scope].collections[ CbServer.default_collection].num_items = self.num_items # Verify doc load count self.bucket_util._wait_for_stats_all_buckets( self.cluster, self.cluster.buckets) self.sleep(30, "Wait for num_items to get reflected") current_items = self.bucket_util.get_bucket_current_item_count( self.cluster, bucket) if current_items == self.num_items: validation_results["post_upgrade_KV_infra"] = True else: self.log.error( "Mismatch in doc_count. Actual: %s, Expected: %s" % (current_items, self.num_items)) validation_results["post_upgrade_KV_infra"] = False self.log.info("Create CBAS infra post upgrade and check for data " "ingestion") if major_version >= 7.0: update_spec = { "no_of_dataverses": self.input.param('no_of_dv', 2), "no_of_datasets_per_dataverse": self.input.param('ds_per_dv', 4), "no_of_synonyms": self.input.param('no_of_synonym', 2), "no_of_indexes": self.input.param('no_of_index', 3), "max_thread_count": self.input.param('no_of_threads', 10), } else: update_spec = { "no_of_dataverses": self.input.param('no_of_dv', 2), "no_of_datasets_per_dataverse": self.input.param('ds_per_dv', 4), "no_of_synonyms": 0, "no_of_indexes": self.input.param('no_of_index', 3), "max_thread_count": self.input.param('no_of_threads', 10), "dataverse": { "cardinality": 1, "creation_method": "dataverse" }, "dataset": { "creation_methods": ["cbas_dataset"], "bucket_cardinality": 1 }, "index": { "creation_method": "index" } } if update_spec["no_of_dataverses"] * update_spec[ "no_of_datasets_per_dataverse"] > 8: self.log_failure("Total number of datasets across all " "dataverses cannot be more than 8 for pre " "7.0 builds") return False if self.cbas_setup(update_spec, False): validation_results["post_upgrade_cbas_infra"] = True else: validation_results["post_upgrade_cbas_infra"] = False if major_version >= 7.1: self.cluster.rest = RestConnection(self.cluster.master) def post_replica_activation_verification(): self.log.info("Verifying doc count accross all datasets") if not self.cbas_util.validate_docs_in_all_datasets( self.cluster, self.bucket_util, timeout=600): self.log_failure( "Docs are missing after replicas become active") validation_results["post_upgrade_replica_verification"] = \ False if update_spec["no_of_indexes"]: self.log.info("Verifying CBAS indexes are working") for idx in self.cbas_util.list_all_index_objs(): statement = "Select * from {0} where age > 5 limit 10".format( idx.full_dataset_name) if not self.cbas_util.verify_index_used( self.cluster, statement, 
index_used=True, index_name=idx.name): self.log.info( "Index {0} on dataset {1} was not used while " "executing query".format( idx.name, idx.full_dataset_name)) self.log.info("Marking one of the CBAS nodes as failed over.") self.available_servers, kv_failover_nodes, cbas_failover_nodes =\ self.rebalance_util.failover( self.cluster, kv_nodes=0, cbas_nodes=1, failover_type="Hard", action=None, timeout=7200, available_servers=[], exclude_nodes=[self.cluster.cbas_cc_node], kv_failover_nodes=None, cbas_failover_nodes=None, all_at_once=False) post_replica_activation_verification() self.available_servers, kv_failover_nodes, cbas_failover_nodes = \ self.rebalance_util.perform_action_on_failed_over_nodes( self.cluster, action=self.input.param('action_on_failover', "FullRecovery"), available_servers=self.available_servers, kv_failover_nodes=kv_failover_nodes, cbas_failover_nodes=cbas_failover_nodes) post_replica_activation_verification() validation_results["post_upgrade_replica_verification"] = True self.log.info("Delete the bucket created before upgrade") if self.bucket_util.delete_bucket(self.cluster, self.cluster.buckets[0], wait_for_bucket_deletion=True): validation_results["bucket_delete"] = True else: validation_results["bucket_delete"] = False if validation_results["bucket_delete"]: self.log.info("Check all datasets created on the deleted bucket " "are empty") results = [] for dataset in self.cbas_util.list_all_dataset_objs( dataset_source="internal"): if dataset.kv_bucket.name == "default": if self.cbas_util.wait_for_ingestion_complete( self.cluster, dataset.full_name, 0, timeout=300): results.append(True) else: results.append(False) validation_results["bucket_delete"] = all(results) for scenario in validation_results: if validation_results[scenario]: self.log.info("{0} : Passed".format(scenario)) else: self.log.info("{0} : Failed".format(scenario)) return validation_results def load_data_into_buckets(self, doc_loading_spec=None): """ Loads data into buckets using the data spec """ self.over_ride_spec_params = self.input.param("override_spec_params", "").split(";") # Init sdk_client_pool if not initialized before if self.sdk_client_pool is None: self.init_sdk_pool_object() self.doc_spec_name = self.input.param("doc_spec", "initial_load") # Create clients in SDK client pool if self.sdk_client_pool: self.log.info("Creating required SDK clients for client_pool") bucket_count = len(self.cluster.buckets) max_clients = self.task_manager.number_of_threads clients_per_bucket = int(ceil(max_clients / bucket_count)) for bucket in self.cluster.buckets: self.sdk_client_pool.create_clients( bucket, [self.cluster.master], clients_per_bucket, compression_settings=self.sdk_compression) if not doc_loading_spec: doc_loading_spec = self.bucket_util.get_crud_template_from_package( self.doc_spec_name) self.over_ride_doc_loading_template_params(doc_loading_spec) # MB-38438, adding CollectionNotFoundException in retry exception doc_loading_spec[MetaCrudParams.RETRY_EXCEPTIONS].append( SDKException.CollectionNotFoundException) doc_loading_task = self.bucket_util.run_scenario_from_spec( self.task, self.cluster, self.cluster.buckets, doc_loading_spec, mutation_num=0, batch_size=self.batch_size) if doc_loading_task.result is False: self.fail("Initial reloading failed") # Verify initial doc load count self.bucket_util._wait_for_stats_all_buckets(self.cluster, self.cluster.buckets) self.bucket_util.validate_docs_per_collections_all_buckets( self.cluster) def over_ride_doc_loading_template_params(self, target_spec): for 
over_ride_param in self.over_ride_spec_params: if over_ride_param == "durability": target_spec[MetaCrudParams.DURABILITY_LEVEL] = \ self.durability_level elif over_ride_param == "sdk_timeout": target_spec[MetaCrudParams.SDK_TIMEOUT] = self.sdk_timeout elif over_ride_param == "doc_size": target_spec[MetaCrudParams.DocCrud.DOC_SIZE] = self.doc_size elif over_ride_param == "num_scopes": target_spec[MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET] = int( self.input.param("num_scopes", 1)) elif over_ride_param == "num_collections": target_spec[ MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES] = int( self.input.param("num_collections", 1)) elif over_ride_param == "num_items": target_spec["doc_crud"][MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS] = \ self.num_items def test_upgrade(self): self.log.info("Upgrading cluster nodes to target version") node_to_upgrade = self.fetch_node_to_upgrade() while node_to_upgrade is not None: self.log.info("Selected node for upgrade: %s" % node_to_upgrade.ip) if self.upgrade_type == "offline": self.upgrade_function[self.upgrade_type](node_to_upgrade, self.upgrade_version, True) else: self.upgrade_function[self.upgrade_type](node_to_upgrade, self.upgrade_version) self.cluster_util.print_cluster_stats(self.cluster) node_to_upgrade = self.fetch_node_to_upgrade() if not all(self.post_upgrade_validation().values()): self.fail("Post upgrade scenarios failed") def test_upgrade_with_failover(self): self.log.info("Upgrading cluster nodes to target version") node_to_upgrade = self.fetch_node_to_upgrade() while node_to_upgrade is not None: self.log.info("Selected node for upgrade: %s" % node_to_upgrade.ip) rest = RestConnection(node_to_upgrade) services = rest.get_nodes_services() services_on_target_node = services[(node_to_upgrade.ip + ":" + node_to_upgrade.port)] self.log.info( "Selected node services {0}".format(services_on_target_node)) if "cbas" in services_on_target_node: self.upgrade_function["failover_full_recovery"]( node_to_upgrade, False) else: self.upgrade_function[self.upgrade_type](node_to_upgrade) self.cluster_util.print_cluster_stats(self.cluster) node_to_upgrade = self.fetch_node_to_upgrade() if not all(self.post_upgrade_validation().values()): self.fail("Post upgrade scenarios failed")
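# Illustrative sketch only (assumption, not part of the suite): the version gates
# above rely on float(self.upgrade_version[:3]), which only works while versions
# keep the single-digit-major "X.Y" form. A tuple comparison is one way to keep
# the same gating logic robust for longer version strings; parse_version is a
# hypothetical helper.
def parse_version(version_string):
    """Turn a build string such as '7.1.3-1234' into (7, 1) for comparisons."""
    parts = version_string.split("-")[0].split(".")
    return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0


# Usage mirroring the checks in pre_upgrade_setup / post_upgrade_validation:
# if parse_version(self.upgrade_version) >= (7, 1): enable analytics replicas.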
class MultiNodeFailOver(CBASBaseTest): """ Class contains test cases for multiple analytics node failures.[CC+NC, NC+NC] """ def setUp(self): super(MultiNodeFailOver, self).setUp() self.log.info("Read the input params") self.nc_nc_fail_over = self.input.param("nc_nc_fail_over", True) self.create_secondary_indexes = self.input.param("create_secondary_indexes", False) # In this fail over we fail first 3 added cbas nodes[CC + first NC + Second NC] self.meta_data_node_failure = self.input.param("meta_data_node_failure", False) self.log.info("Add CBAS nodes to cluster") self.assertIsNotNone(self.cluster_util.add_node(self.cluster.cbas_nodes[0], services=["cbas"], rebalance=False), msg="Add node failed") self.assertIsNotNone(self.cluster_util.add_node(self.cluster.cbas_nodes[1], services=["cbas"], rebalance=True), msg="Add node failed") # This node won't be failed over if self.meta_data_node_failure: self.assertIsNotNone(self.cluster_util.add_node(self.cluster.cbas_nodes[2], services=["cbas"], rebalance=True), msg="Add node failed") self.log.info("Create connection") self.cbas_util.createConn(self.cb_bucket_name) self.log.info("Load documents in kv bucket") self.perform_doc_ops_in_all_cb_buckets("create", 0, self.num_items) self.log.info("Create dataset") self.cbas_util.create_dataset_on_bucket(self.cb_bucket_name, self.cbas_dataset_name) self.log.info("Create secondary index") if self.create_secondary_indexes: self.index_fields = "profession:string,number:bigint" create_idx_statement = "create index {0} on {1}({2});".format(self.index_name, self.cbas_dataset_name, self.index_fields) status, metrics, errors, results, _ = self.cbas_util.execute_statement_on_cbas_util(create_idx_statement) self.assertTrue(status == "success", "Create Index query failed") self.assertTrue(self.cbas_util.verify_index_created(self.index_name, self.index_fields.split(","), self.cbas_dataset_name)[0]) self.log.info("Connect Local link") self.cbas_util.connect_link() self.log.info("Validate dataset count") self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items) self.log.info("Pick nodes to fail over") self.fail_over_nodes = [] if self.nc_nc_fail_over: self.log.info("This is NC+NC fail over") self.fail_over_nodes.append(self.cluster.cbas_nodes[0]) self.fail_over_nodes.append(self.cluster.cbas_nodes[1]) self.neglect_failures = False else: self.log.info("This is NC+CC fail over") self.fail_over_nodes.append(self.cluster.cbas_nodes[0]) self.fail_over_nodes.append(self.cbas_node) self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[1], self.task) if self.meta_data_node_failure: self.fail_over_nodes.append(self.cluster.cbas_nodes[1]) self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[2], self.task) self.cbas_util.createConn(self.cb_bucket_name) self.neglect_failures = True def test_cbas_multi_node_fail_over(self): self.log.info("fail-over the node") fail_over_task = self._cb_cluster.async_failover(self.input.servers, self.fail_over_nodes) self.assertTrue(self.task_manager.get_task_result(fail_over_task), msg="Fail over of nodes failed") self.log.info("Rebalance remaining nodes") result = self.cluster_util.rebalance() self.assertTrue(result, "Rebalance operation failed") self.log.info("Validate dataset count") self.assertTrue(self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items), msg="Document count mismatch") def test_cbas_multi_node_fail_over_busy_system(self): self.log.info("Perform doc 
operation async") tasks = self.perform_doc_ops_in_all_cb_buckets( "create", start_key=self.num_items, end_key=self.num_items+(self.num_items/4), _async=True) self.log.info("Run concurrent queries to simulate busy system") statement = "select sleep(count(*),50000) from {0} where mutated=0;".format(self.cbas_dataset_name) try: self.cbas_util._run_concurrent_queries(statement, "async", 10, batch_size=10) except Exception as e: if self.neglect_failures: self.log.info("Neglecting failed queries, to handle node fail over CC") else: raise e self.log.info("fail-over the node") fail_over_task = self._cb_cluster.async_failover(self.input.servers, self.fail_over_nodes) self.assertTrue(self.task_manager.get_task_result(fail_over_task), msg="Fail over of nodes failed") self.log.info("Rebalance remaining nodes") result = self.cluster_util.rebalance() self.assertTrue(result, "Rebalance operation failed") for task in tasks: self.log.info(self.task_manager.get_task_result(task)) self.log.info("Validate dataset count") self.assertTrue(self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items + self.num_items/4), msg="Document count mismatch") def tearDown(self): super(MultiNodeFailOver, self).tearDown()
class CBASClusterOperations(CBASBaseTest): def setUp(self): self.input = TestInputSingleton.input self.input.test_params.update({"default_bucket":False}) self.rebalanceServers = None self.nodeType = "KV" self.wait_for_rebalance=True super(CBASClusterOperations, self).setUp() self.num_items = self.input.param("items", 1000) self.bucket_util.create_default_bucket() # self.cbas_util.createConn("default") if 'nodeType' in self.input.test_params: self.nodeType = self.input.test_params['nodeType'] self.rebalance_both = self.input.param("rebalance_cbas_and_kv", False) if not self.rebalance_both: if self.nodeType == "KV": self.rebalanceServers = self.cluster.kv_nodes self.wait_for_rebalance=False elif self.nodeType == "CBAS": self.rebalanceServers = [self.cbas_node] + self.cluster.cbas_nodes else: self.rebalanceServers = self.cluster.kv_nodes + [self.cbas_node] + self.cluster.cbas_nodes self.nodeType = "KV" + "-" +"CBAS" self.assertTrue(len(self.rebalanceServers)>1, "Not enough %s servers to run tests."%self.rebalanceServers) self.log.info("This test will be running in %s context."%self.nodeType) self.load_gen_tasks = [] def setup_for_test(self, skip_data_loading=False): if not skip_data_loading: # Load Couchbase bucket first. self.perform_doc_ops_in_all_cb_buckets("create", 0, self.num_items) self.cbas_util.createConn(self.cb_bucket_name) # Create dataset on the CBAS bucket self.cbas_util.create_dataset_on_bucket(cbas_bucket_name=self.cb_bucket_name, cbas_dataset_name=self.cbas_dataset_name, compress_dataset=self.compress_dataset) # Create indexes on the CBAS bucket self.create_secondary_indexes = self.input.param("create_secondary_indexes",False) if self.create_secondary_indexes: self.index_fields = "profession:string,number:bigint" create_idx_statement = "create index {0} on {1}({2});".format( self.index_name, self.cbas_dataset_name, self.index_fields) status, metrics, errors, results, _ = self.cbas_util.execute_statement_on_cbas_util( create_idx_statement) self.assertTrue(status == "success", "Create Index query failed") self.assertTrue( self.cbas_util.verify_index_created(self.index_name, self.index_fields.split(","), self.cbas_dataset_name)[0]) # Connect to Bucket self.cbas_util.connect_to_bucket(cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) if not skip_data_loading: # Validate no. of items in CBAS dataset if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket") def test_rebalance_in(self): ''' Description: This will test the rebalance in feature i.e. one node coming in to the cluster. Then Rebalance. Verify that is has no effect on the data ingested to cbas. Steps: 1. Setup cbas. bucket, datasets/shadows, connect. 2. Add a node and rebalance. Don't wait for rebalance completion. 3. During rebalance, do mutations and execute queries on cbas. 
Author: Ritesh Agarwal/Mihir Kamdar Date Created: 18/07/2017 ''' query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.cluster_util.add_node(node=self.rebalanceServers[1], rebalance=True, wait_for_rebalance_completion=self.wait_for_rebalance) self.log.info("Rebalance state:%s"%self.rest._rebalance_progress_status()) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 2) self.log.info("Rebalance state:%s"%self.rest._rebalance_progress_status()) self.cbas_util._run_concurrent_queries(query,None,2000,batch_size=self.concurrent_batch_size) self.log.info("Rebalance state:%s"%self.rest._rebalance_progress_status()) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 2, 0): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket") def test_rebalance_out(self): ''' Description: This will test the rebalance out feature i.e. one node going out of cluster. Then Rebalance. Steps: 1. Add a node, Rebalance. 2. Setup cbas. bucket, datasets/shadows, connect. 3. Remove a node and rebalance. Don't wait for rebalance completion. 4. During rebalance, do mutations and execute queries on cbas. Author: Ritesh Agarwal/Mihir Kamdar Date Created: 18/07/2017 ''' self.cluster_util.add_node(node=self.rebalanceServers[1]) query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() otpnodes = [] nodes = self.rest.node_statuses() for node in nodes: if node.ip == self.rebalanceServers[1].ip: otpnodes.append(node) self.remove_node(otpnodes, wait_for_rebalance=self.wait_for_rebalance) self.log.info("Rebalance state:%s"%self.rest._rebalance_progress_status()) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 2) self.cbas_util._run_concurrent_queries(query,"immediate",2000,batch_size=self.concurrent_batch_size) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 2, 0): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket") def test_swap_rebalance(self): ''' Description: This will test the swap rebalance feature i.e. one node going out and one node coming in cluster. Then Rebalance. Verify that is has no effect on the data ingested to cbas. Steps: 1. Setup cbas. bucket, datasets/shadows, connect. 2. Add a node that is to be swapped against the leaving node. Do not rebalance. 3. Remove a node and rebalance. 4. During rebalance, do mutations and execute queries on cbas. Author: Ritesh Agarwal/Mihir Kamdar Date Created: 20/07/2017 ''' query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() otpnodes=[] nodes = self.rest.node_statuses() if self.nodeType == "KV": service = ["kv"] else: service = ["cbas"] otpnodes.append(self.cluster_util.add_node(node=self.servers[1], services=service)) self.cluster_util.add_node(node=self.servers[3], services=service,rebalance=False) self.remove_node(otpnodes, wait_for_rebalance=self.wait_for_rebalance) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 2) self.cbas_util._run_concurrent_queries(query,"immediate",2000,batch_size=self.concurrent_batch_size) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 2, 0): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket") def test_failover(self): ''' Description: This will test the node failover both graceful and hard failover based on graceful_failover param in testcase conf file. Steps: 1. Add node to the cluster which will be failed over. 2. Create docs, setup cbas. 3. Mark the node for fail over. 4. Do rebalance asynchronously. During rebalance perform mutations. 5. Run some CBAS queries. 6. Check for correct number of items in CBAS datasets. Author: Ritesh Agarwal/Mihir Kamdar Date Created: 20/07/2017 ''' #Add node which will be failed over later. self.cluster_util.add_node(node=self.rebalanceServers[1]) query = "select count(*) from {0};".format(self.cbas_dataset_name) graceful_failover = self.input.param("graceful_failover", False) self.setup_for_test() failover_task = self._cb_cluster.async_failover(self.input.servers, [self.rebalanceServers[1]], graceful_failover) self.task_manager.get_task_result(failover_task) result = self.cluster_util.rebalance() self.assertTrue(result, "Rebalance operation failed") self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 3 / 2) self.cbas_util._run_concurrent_queries(query,"immediate",2000,batch_size=self.concurrent_batch_size) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 3 / 2, 0): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket") ''' -i b/resources/4-nodes-template.ini -t cbas.cbas_cluster_operations.CBASClusterOperations.test_rebalance_in_cb_cbas_together,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=KV,rebalance_cbas_and_kv=True,wait_for_rebalace=False ''' def test_rebalance_in_cb_cbas_together(self): self.log.info("Creates cbas buckets and dataset") dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Rebalance in KV node") wait_for_rebalace_complete = self.input.param("wait_for_rebalace", False) self.cluster_util.add_node(node=self.rebalanceServers[1], rebalance=False, wait_for_rebalance_completion=wait_for_rebalace_complete) self.log.info("Rebalance in CBAS node") self.cluster_util.add_node(node=self.rebalanceServers[3], rebalance=True, wait_for_rebalance_completion=wait_for_rebalace_complete) self.log.info( "Perform document create as rebalance is in progress : Rebalance state:%s" % self.rest._rebalance_progress_status()) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 2) self.log.info( "Run queries as rebalance is in progress : Rebalance state:%s" % self.rest._rebalance_progress_status()) handles = self.cbas_util._run_concurrent_queries(dataset_count_query, None, 2000, batch_size=self.concurrent_batch_size) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 2, 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") ''' -i b/resources/4-nodes-template.ini -t cbas.cbas_cluster_operations.CBASClusterOperations.test_rebalance_out_cb_cbas_together,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=KV,rebalance_cbas_and_kv=True,wait_for_rebalace=False ''' def test_rebalance_out_cb_cbas_together(self): self.log.info("Rebalance in KV node and wait for rebalance to complete") self.cluster_util.add_node(node=self.rebalanceServers[1]) self.log.info("Rebalance in CBAS node and wait for rebalance to complete") self.cluster_util.add_node(node=self.rebalanceServers[3]) self.log.info("Creates cbas buckets and dataset") dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Fetch and remove nodes to rebalance out") wait_for_rebalace_complete = self.input.param("wait_for_rebalace", False) otpnodes = [] nodes = self.rest.node_statuses() for node in nodes: if node.ip == self.rebalanceServers[1].ip or node.ip == self.rebalanceServers[3].ip: otpnodes.append(node) for every_node in otpnodes: self.remove_node([every_node], wait_for_rebalance=wait_for_rebalace_complete) self.sleep(30, message="Sleep for 30 seconds for remove node to complete") self.log.info( "Perform document create as rebalance is in progress : Rebalance state:%s" % self.rest._rebalance_progress_status()) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items, self.num_items * 2) self.log.info( "Run queries as rebalance is in progress : Rebalance state:%s" % self.rest._rebalance_progress_status()) handles = self.cbas_util._run_concurrent_queries(dataset_count_query, "immediate", 2000, batch_size=self.concurrent_batch_size) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 2, 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") ''' -i b/resources/4-nodes-template.ini -t cbas.cbas_cluster_operations.CBASClusterOperations.test_swap_rebalance_cb_cbas_together,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,rebalance_cbas_and_kv=True,wait_for_rebalance=True ''' def test_swap_rebalance_cb_cbas_together(self): self.log.info("Creates cbas buckets and dataset") wait_for_rebalance = self.input.param("wait_for_rebalance", True) dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Add KV node and don't rebalance") self.cluster_util.add_node(node=self.rebalanceServers[1], rebalance=False) self.log.info("Add cbas node and don't rebalance") self.cluster_util.add_node(node=self.rebalanceServers[3], rebalance=False) otpnodes = [] nodes = self.rest.node_statuses() for node in nodes: if node.ip == self.rebalanceServers[0].ip or node.ip == self.rebalanceServers[2].ip: otpnodes.append(node) self.log.info("Remove master node") self.remove_node(otpnode=otpnodes, wait_for_rebalance=wait_for_rebalance) self.cluster.master = self.rebalanceServers[1] self.log.info("Create instances pointing to new master nodes") c_utils = CbasUtil(self.rebalanceServers[1], self.rebalanceServers[3], self.task) c_utils.createConn(self.cb_bucket_name) self.log.info("Create reference to SDK client") client = SDKClient(scheme="couchbase", hosts=[self.rebalanceServers[1].ip], bucket=self.cb_bucket_name, password=self.rebalanceServers[1].rest_password) self.log.info("Add more document to default bucket") documents = ['{"name":"value"}'] * (self.num_items//10) document_id_prefix = "custom-id-" client.insert_custom_json_documents(document_id_prefix, documents) self.log.info( "Run queries as rebalance is in progress : Rebalance state:%s" % self.rest._rebalance_progress_status()) handles = c_utils._run_concurrent_queries(dataset_count_query, "immediate", 2000, batch_size=self.concurrent_batch_size) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) if not c_utils.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items + (self.num_items//10) , 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") def test_rebalance_in_multiple_cbas_on_a_busy_system(self): node_services = [] node_services.append(self.input.param('service',"cbas")) self.log.info("Setup CBAS") self.setup_for_test(skip_data_loading=True) self.log.info("Run KV ops in async while rebalance is in progress") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items, start=0) tasks = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0) self.log.info("Run concurrent queries to simulate busy system") statement = "select sleep(count(*),50000) from {0} where mutated=0;".format(self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(statement, self.mode, self.num_concurrent_queries) self.log.info("Rebalance in CBAS nodes") self.cluster_util.add_node(node=self.rebalanceServers[1], services=node_services, rebalance=False, wait_for_rebalance_completion=False) self.cluster_util.add_node(node=self.rebalanceServers[3], services=node_services, rebalance=True, wait_for_rebalance_completion=True) self.log.info("Get KV ops result") for task in tasks: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items, 0): self.fail("No. of items in CBAS dataset do not match that in the CB bucket") def test_rebalance_out_multiple_cbas_on_a_busy_system(self): node_services = [] node_services.append(self.input.param('service',"cbas")) self.log.info("Rebalance in CBAS nodes") self.cluster_util.add_node(node=self.rebalanceServers[1], services=node_services) self.cluster_util.add_node(node=self.rebalanceServers[3], services=node_services) self.log.info("Setup CBAS") self.setup_for_test(skip_data_loading=True) self.log.info("Run KV ops in async while rebalance is in progress") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items, start=0) tasks = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0) self.log.info("Run concurrent queries to simulate busy system") statement = "select sleep(count(*),50000) from {0} where mutated=0;".format(self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(statement, self.mode, self.num_concurrent_queries) self.log.info("Fetch and remove nodes to rebalance out") self.rebalance_cc = self.input.param("rebalance_cc", False) out_nodes = [] nodes = self.rest.node_statuses() if self.rebalance_cc: for node in nodes: if node.ip == self.cbas_node.ip or node.ip == self.servers[1].ip: out_nodes.append(node) self.cbas_util.closeConn() self.log.info("Reinitialize CBAS utils with ip %s, since CC node is rebalanced out" %self.servers[3].ip) self.cbas_util = CbasUtil(self.cluster.master, self.servers[3], self.task) self.cbas_util.createConn("default") else: for node in nodes: if node.ip == self.servers[3].ip or node.ip == self.servers[1].ip: out_nodes.append(node) self.log.info("Rebalance out CBAS nodes %s %s" % (out_nodes[0].ip, out_nodes[1].ip)) self.remove_all_nodes_then_rebalance([out_nodes[0],out_nodes[1]]) self.log.info("Get KV ops result") for task in tasks: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) if not 
self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items, 0): self.fail("No. of items in CBAS dataset do not match that in the CB bucket") ''' cbas.cbas_cluster_operations.CBASClusterOperations.test_rebalance_swap_multiple_cbas_on_a_busy_system,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,rebalance_cbas_and_kv=True,service=cbas,rebalance_cc=False cbas.cbas_cluster_operations.CBASClusterOperations.test_rebalance_swap_multiple_cbas_on_a_busy_system,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,rebalance_cbas_and_kv=True,service=cbas,rebalance_cc=True ''' def test_rebalance_swap_multiple_cbas_on_a_busy_system(self): ''' 1. We have 4 node cluster with 1 KV and 3 CBAS. Assume the IPS end with 101(KV), 102(CBAS), 103(CBAS), 104(CBAS) 2, Post initial setup - 101 running KV and 102 running CBAS as CC node 3. As part of test test add an extra NC node that we will swap rebalance later - Adding 103 and rebalance 4. If swap rebalance NC - then select the node added in #3 for remove and 104 to add during swap 5. If swap rebalance CC - then select the CC node added for remove and 104 to add during swap ''' self.log.info('Read service input param') node_services = [] node_services.append(self.input.param('service', "cbas")) self.log.info("Rebalance in CBAS nodes, this node will be removed during swap") self.cluster_util.add_node(node=self.rebalanceServers[1], services=node_services) self.log.info("Setup CBAS") self.setup_for_test(skip_data_loading=True) self.log.info("Run KV ops in async while rebalance is in progress") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items, start=0) tasks = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0) self.log.info("Run concurrent queries to simulate busy system") statement = "select sleep(count(*),50000) from {0} where mutated=0;".format(self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(statement, self.mode, self.num_concurrent_queries) self.log.info("Fetch node to remove during rebalance") self.rebalance_cc = self.input.param("rebalance_cc", False) out_nodes = [] nodes = self.rest.node_statuses() reinitialize_cbas_util = False for node in nodes: if self.rebalance_cc and (node.ip == self.cbas_node.ip): out_nodes.append(node) reinitialize_cbas_util = True elif not self.rebalance_cc and node.ip == self.rebalanceServers[1].ip: out_nodes.append(node) self.log.info("Swap rebalance CBAS nodes") self.cluster_util.add_node(node=self.rebalanceServers[3], services=node_services, rebalance=False) self.remove_node([out_nodes[0]], wait_for_rebalance=True) self.log.info("Get KV ops result") for task in tasks: self.task_manager.get_task_result(task) if reinitialize_cbas_util is True: self.cbas_util = CbasUtil(self.cluster.master, self.rebalanceServers[3], self.task) self.cbas_util.createConn("default") self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) count_n1ql = self.rest.query_tool('select count(*) from %s' % (self.cb_bucket_name))['results'][0]['$1'] if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, count_n1ql, 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") ''' test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=True,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=True,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=True,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=False,recovery_strategy=full,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=True,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=False,recovery_strategy=delta,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=False,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=True,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=False,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=False,recovery_strategy=full,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=False,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=KV,rebalance_out=False,recovery_strategy=delta,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=False,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=CBAS,rebalance_out=True,concurrent_batch_size=500 test_fail_over_node_followed_by_rebalance_out_or_add_back,cb_bucket_name=default,graceful_failover=False,cbas_bucket_name=default_cbas,cbas_dataset_name=default_ds,items=10000,nodeType=CBAS,rebalance_out=False,recovery_strategy=full,concurrent_batch_size=500 ''' def test_fail_over_node_followed_by_rebalance_out_or_add_back(self): """ 1. Start with an initial setup, having 1 KV and 1 CBAS 2. Add a node that will be failed over - KV/CBAS 3. Create CBAS buckets and dataset 4. Fail over the KV node based in graceful_failover parameter specified 5. Rebalance out/add back based on input param specified in conf file 6. Perform doc operations 7. run concurrent queries 8. 
Verify document count on dataset post failover """ self.log.info("Add an extra node to fail-over") self.cluster_util.add_node(node=self.rebalanceServers[1]) self.log.info("Read the failure out type to be performed") graceful_failover = self.input.param("graceful_failover", True) self.log.info("Set up test - Create cbas buckets and data-sets") self.setup_for_test() self.log.info("Perform Async doc operations on KV") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items * 3 // 2, start=self.num_items) kv_task = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0) self.log.info("Run concurrent queries on CBAS") query = "select count(*) from {0};".format(self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", self.num_concurrent_queries, batch_size=self.concurrent_batch_size) self.log.info("Fail over the node") fail_task = self._cb_cluster.async_failover(self.input.servers, [self.rebalanceServers[1]], graceful_failover) self.task_manager.get_task_result(fail_task) self.log.info("Read input param to decide on add back or rebalance out") self.rebalance_out = self.input.param("rebalance_out", False) if self.rebalance_out: self.log.info("Rebalance out the fail-over node") result = self.cluster_util.rebalance() self.assertTrue(result, "Rebalance operation failed") else: self.recovery_strategy = self.input.param("recovery_strategy", "full") self.log.info("Performing %s recovery" % self.recovery_strategy) success = False end_time = datetime.datetime.now() + datetime.timedelta(minutes=int(1)) while datetime.datetime.now() < end_time and not success: try: self.sleep(10, message="Wait for fail over complete") self.rest.set_recovery_type('ns_1@' + self.rebalanceServers[1].ip, self.recovery_strategy) success = True except Exception: self.log.info("Fail over in progress. Re-try after 10 seconds.") pass if not success: self.fail("Recovery %s failed." % self.recovery_strategy) self.rest.add_back_node('ns_1@' + self.rebalanceServers[1].ip) result = self.cluster_util.rebalance() self.assertTrue(result, "Rebalance operation failed") self.log.info("Get KV ops result") for task in kv_task: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) self.log.info("Validate dataset count on CBAS") count_n1ql = self.rest.query_tool('select count(*) from `%s`' % self.cb_bucket_name)['results'][0]['$1'] if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, count_n1ql, 0, timeout=400, analytics_timeout=400): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") ''' test_to_fail_initial_rebalance_and_verify_subsequent_rebalance_succeeds,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=CBAS,num_queries=10,restart_couchbase_on_incoming_or_outgoing_node=True,rebalance_type=in test_to_fail_initial_rebalance_and_verify_subsequent_rebalance_succeeds,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=CBAS,num_queries=10,restart_couchbase_on_incoming_or_outgoing_node=True,rebalance_type=out test_to_fail_initial_rebalance_and_verify_subsequent_rebalance_succeeds,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=CBAS,num_queries=10,restart_couchbase_on_incoming_or_outgoing_node=True,rebalance_type=swap ''' def test_to_fail_initial_rebalance_and_verify_subsequent_rebalance_succeeds(self): self.log.info("Pick the incoming and outgoing nodes during rebalance") self.rebalance_type = self.input.param("rebalance_type", "in") nodes_to_add = [self.rebalanceServers[1]] nodes_to_remove = [] reinitialize_cbas_util = False if self.rebalance_type == 'out': nodes_to_remove.append(self.rebalanceServers[1]) self.cluster_util.add_node(self.rebalanceServers[1]) nodes_to_add = [] elif self.rebalance_type == 'swap': self.cluster_util.add_node(nodes_to_add[0], rebalance=False) nodes_to_remove.append(self.cbas_node) reinitialize_cbas_util = True self.log.info("Incoming nodes - %s, outgoing nodes - %s. For rebalance type %s " %(nodes_to_add, nodes_to_remove, self.rebalance_type)) self.log.info("Creates cbas buckets and dataset") dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Perform async doc operations on KV") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items * 3 / 2, start=self.num_items) kv_task = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0, batch_size=5000) self.log.info("Run concurrent queries on CBAS") handles = self.cbas_util._run_concurrent_queries(dataset_count_query, "async", self.num_concurrent_queries) self.log.info("Fetch the server to restart couchbase on") restart_couchbase_on_incoming_or_outgoing_node = self.input.param("restart_couchbase_on_incoming_or_outgoing_node", True) if not restart_couchbase_on_incoming_or_outgoing_node: node = self.cbas_node else: node = self.rebalanceServers[1] shell = RemoteMachineShellConnection(node) self.log.info("Rebalance nodes") self.task.async_rebalance(self.servers, nodes_to_add, nodes_to_remove) self.log.info("Restart Couchbase on node %s" % node.ip) shell.restart_couchbase() self.sleep(30, message="Waiting for service to be back again...") self.log.info("Verify subsequent rebalance is successful") nodes_to_add = [] # Node is already added to cluster in previous rebalance, adding it again will throw exception self.assertTrue(self.task.rebalance(self.servers, nodes_to_add, nodes_to_remove)) if reinitialize_cbas_util is True: self.cbas_util = CbasUtil(self.cluster.master, self.rebalanceServers[1], self.task) self.cbas_util.createConn("default") self.cbas_util.wait_for_cbas_to_recover() self.log.info("Get KV ops result") for task in kv_task: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) self.log.info("Validate dataset count on CBAS") if not 
self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 3 / 2, 0): self.fail("No. of items in CBAS dataset do not match that in the CB bucket") def test_auto_retry_failed_rebalance(self): # Auto-retry rebalance settings body = {"enabled": "true", "afterTimePeriod": self.retry_time, "maxAttempts": self.num_retries} rest = RestConnection(self.cluster.master) rest.set_retry_rebalance_settings(body) result = rest.get_retry_rebalance_settings() self.log.info("Pick the incoming and outgoing nodes during rebalance") self.rebalance_type = self.input.param("rebalance_type", "in") nodes_to_add = [self.rebalanceServers[1]] nodes_to_remove = [] reinitialize_cbas_util = False if self.rebalance_type == 'out': nodes_to_remove.append(self.rebalanceServers[1]) self.cluster_util.add_node(self.rebalanceServers[1]) nodes_to_add = [] elif self.rebalance_type == 'swap': self.cluster_util.add_node(nodes_to_add[0], rebalance=False) nodes_to_remove.append(self.cbas_node) reinitialize_cbas_util = True self.log.info("Incoming nodes - %s, outgoing nodes - %s. For rebalance type %s " % ( nodes_to_add, nodes_to_remove, self.rebalance_type)) self.log.info("Creates cbas buckets and dataset") dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Perform async doc operations on KV") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items * 3 / 2, start=self.num_items) kv_task = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0, batch_size=5000) self.log.info("Run concurrent queries on CBAS") handles = self.cbas_util._run_concurrent_queries(dataset_count_query, "async", self.num_concurrent_queries) self.log.info("Fetch the server to restart couchbase on") restart_couchbase_on_incoming_or_outgoing_node = self.input.param( "restart_couchbase_on_incoming_or_outgoing_node", True) if not restart_couchbase_on_incoming_or_outgoing_node: node = self.cbas_node else: node = self.rebalanceServers[1] shell = RemoteMachineShellConnection(node) try: self.log.info("Rebalance nodes") self.task.async_rebalance(self.servers, nodes_to_add, nodes_to_remove) self.sleep(10, message="Restarting couchbase after 10s on node %s" % node.ip) shell.restart_couchbase() self.sleep(30, message="Waiting for service to be back again...") self.sleep(self.retry_time, "Wait for retry time to complete and then check the rebalance results") reached = RestHelper(self.rest).rebalance_reached(wait_step=120) self.log.info("Rebalance status : {0}".format(reached)) self.sleep(20) self._check_retry_rebalance_succeeded() if reinitialize_cbas_util is True: self.cbas_util = CbasUtil(self.cluster.master, self.rebalanceServers[1], self.task) self.cbas_util.createConn("default") self.cbas_util.wait_for_cbas_to_recover() self.log.info("Get KV ops result") for task in kv_task: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) self.log.info("Validate dataset count on CBAS") if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 3 / 2, 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") except Exception as e: self.fail("Some exception occurred : {0}".format(e.message)) finally: body = {"enabled": "false"} rest.set_retry_rebalance_settings(body) ''' test_rebalance_on_nodes_running_multiple_services,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=KV,num_queries=10,rebalance_type=in test_rebalance_on_nodes_running_multiple_services,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,nodeType=KV,num_queries=10,rebalance_type=out test_rebalance_on_nodes_running_multiple_services,cb_bucket_name=default,cbas_bucket_name=default_bucket,cbas_dataset_name=default_ds,items=10,num_queries=10,rebalance_type=swap,rebalance_cbas_and_kv=True ''' def test_rebalance_on_nodes_running_multiple_services(self): self.log.info("Pick the incoming and outgoing nodes during rebalance") active_services = ['cbas,fts,kv'] self.rebalance_type = self.input.param("rebalance_type", "in") nodes_to_add = [self.rebalanceServers[1]] nodes_to_remove = [] if self.rebalance_type == 'out': # This node will be rebalanced out nodes_to_remove.append(self.rebalanceServers[1]) # Will be running services as specified in the list - active_services self.cluster_util.add_node(nodes_to_add[0], services=active_services) # No nodes to remove so making the add notes empty nodes_to_add = [] elif self.rebalance_type == 'swap': # Below node will be swapped with the incoming node specified in nodes_to_add self.cluster_util.add_node(nodes_to_add[0], services=active_services) nodes_to_add = [] nodes_to_add.append(self.rebalanceServers[3]) # Below node will be removed and swapped with node that was added earlier nodes_to_remove.append(self.rebalanceServers[1]) self.log.info("Incoming nodes - %s, outgoing nodes - %s. For rebalance type %s " % ( nodes_to_add, nodes_to_remove, self.rebalance_type)) self.log.info("Creates cbas buckets and dataset") dataset_count_query = "select count(*) from {0};".format(self.cbas_dataset_name) self.setup_for_test() self.log.info("Perform async doc operations on KV") json_generator = JsonGenerator() generators = json_generator.generate_docs_simple(docs_per_day=self.num_items * 3 / 2, start=self.num_items) kv_task = self.bucket_util._async_load_all_buckets(self.cluster, generators, "create", 0, batch_size=5000) self.log.info("Run concurrent queries on CBAS") handles = self.cbas_util._run_concurrent_queries(dataset_count_query, "async", self.num_concurrent_queries) self.log.info("Rebalance nodes") # Do not add node to nodes_to_add if already added as add_node earlier self.task.rebalance(self.servers, nodes_to_add, nodes_to_remove, services=active_services) self.log.info("Get KV ops result") for task in kv_task: self.task_manager.get_task_result(task) self.log.info("Log concurrent query status") self.cbas_util.log_concurrent_query_outcome(self.cluster.master, handles) self.log.info("Validate dataset count on CBAS") if not self.cbas_util.validate_cbas_dataset_items_count(self.cbas_dataset_name, self.num_items * 3 / 2, 0): self.fail("No. 
of items in CBAS dataset do not match that in the CB bucket") def tearDown(self): super(CBASClusterOperations, self).tearDown() def _check_retry_rebalance_succeeded(self): rest = RestConnection(self.cluster.master) result = json.loads(rest.get_pending_rebalance_info()) self.log.info(result) if "retry_rebalance" in result and result["retry_rebalance"] != "not_pending": retry_after_secs = result["retry_after_secs"] attempts_remaining = result["attempts_remaining"] retry_rebalance = result["retry_rebalance"] self.log.info("Attempts remaining : {0}, Retry rebalance : {1}".format(attempts_remaining, retry_rebalance)) while attempts_remaining: # wait for the afterTimePeriod for the failed rebalance to restart self.sleep(retry_after_secs, message="Waiting for the afterTimePeriod to complete") try: result = self.rest.monitorRebalance() msg = "monitoring rebalance {0}" self.log.info(msg.format(result)) except Exception: result = json.loads(self.rest.get_pending_rebalance_info()) self.log.info(result) try: attempts_remaining = result["attempts_remaining"] retry_rebalance = result["retry_rebalance"] retry_after_secs = result["retry_after_secs"] except KeyError: self.fail("Retrying of rebalance still did not help. All the retries exhausted...") self.log.info("Attempts remaining : {0}, Retry rebalance : {1}".format(attempts_remaining, retry_rebalance)) else: self.log.info("Retry rebalanced fixed the rebalance failure") break
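# Note on the auto-retry rebalance flow used by test_auto_retry_failed_rebalance and
# _check_retry_rebalance_succeeded above: it is driven by two REST calls, enabling the
# retry settings before the rebalance and polling the pending-retry info afterwards.
# The helper below is only a minimal sketch of that pairing. It reuses the RestConnection
# methods already called in this file and assumes the module-level json/RestConnection
# imports present at the top of the file; the helper name itself is illustrative and is
# not part of the framework.
def enable_and_poll_retry_rebalance(master, retry_after_secs, max_attempts):
    """Sketch: enable auto-retry rebalance and return the pending-retry info."""
    rest = RestConnection(master)
    # Enable auto-retry with the given retry window and attempt budget
    rest.set_retry_rebalance_settings({"enabled": "true",
                                       "afterTimePeriod": retry_after_secs,
                                       "maxAttempts": max_attempts})
    # Read back what ns_server reports as pending, mirroring
    # _check_retry_rebalance_succeeded above
    return json.loads(rest.get_pending_rebalance_info())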
class MetadataReplication(CBASBaseTest): def tearDown(self): CBASBaseTest.tearDown(self) def setUp(self): self.input = TestInputSingleton.input self.input.test_params.update({"default_bucket": False}) super(MetadataReplication, self).setUp() self.nc_otpNodes = [] if "add_all_cbas_nodes" in self.input.test_params and self.input.test_params[ "add_all_cbas_nodes"] and len(self.cluster.cbas_nodes) > 0: self.nc_otpNodes = self.add_all_nodes_then_rebalance( self.cluster.cbas_nodes) elif self.input.param("nc_nodes_to_add", 0): self.nc_otpNodes = self.add_all_nodes_then_rebalance( self.cluster.cbas_nodes[:self.input.param("nc_nodes_to_add")]) self.otpNodes += self.nc_otpNodes self.bucket_util.create_default_bucket(self.cluster, storage=self.bucket_storage) self.cbas_util.createConn("default") self.shell = RemoteMachineShellConnection(self.cluster.master) #test for number of partitions: self.partitions_dict = self.cbas_util.get_num_partitions(self.shell) # if self.cluster.master.cbas_path: # for key in self.partitions_dict.keys(): # self.assertTrue(self.partitions_dict[key] == len(ast.literal_eval(self.cluster.master.cbas_path)), "Number of partitions created are incorrect on cbas nodes.") def setup_for_test(self, skip_data_loading=False): if not skip_data_loading: # Load Couchbase bucket first. self.perform_doc_ops_in_all_cb_buckets("create", 0, self.num_items, batch_size=1000) # Create dataset on the CBAS bucket self.cbas_util.create_dataset_on_bucket( cbas_bucket_name=self.cb_bucket_name, cbas_dataset_name=self.cbas_dataset_name) # Create indexes on the CBAS bucket self.create_secondary_indexes = self.input.param( "create_secondary_indexes", False) if self.create_secondary_indexes: self.index_fields = "profession:string,number:bigint" create_idx_statement = "create index {0} on {1}({2});".format( self.index_name, self.cbas_dataset_name, self.index_fields) status, metrics, errors, results, _ = self.cbas_util.execute_statement_on_cbas_util( create_idx_statement) self.assertTrue(status == "success", "Create Index query failed") self.assertTrue( self.cbas_util.verify_index_created( self.index_name, self.index_fields.split(","), self.cbas_dataset_name)[0]) # Connect to Bucket self.cbas_util.connect_to_bucket( cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) if not skip_data_loading: # Validate no. of items in CBAS dataset if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) def ingestion_in_progress(self): self.cbas_util.disconnect_from_bucket(self.cbas_bucket_name) self.perform_doc_ops_in_all_cb_buckets("create", 0, self.num_items * 2, batch_size=1000) self.cbas_util.connect_to_bucket( cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) def ingest_more_data(self): self.cbas_util.disconnect_from_bucket(self.cbas_bucket_name) self.perform_doc_ops_in_all_cb_buckets("create", self.num_items * 2, self.num_items * 4, batch_size=1000) self.cbas_util.connect_to_bucket( cbas_bucket_name=self.cbas_bucket_name, cb_bucket_password=self.cb_bucket_password) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 4): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) def test_rebalance(self): self.setup_for_test(skip_data_loading=True) self.rebalance_type = self.input.param('rebalance_type', 'out') self.rebalance_node = self.input.param('rebalance_node', 'CC') self.how_many = self.input.param('how_many', 1) self.restart_rebalance = self.input.param('restart_rebalance', False) self.replica_change = self.input.param('replica_change', 0) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() otpNodes = [] if self.rebalance_node == "CC": node_in_test = [self.cbas_node] otpNodes = [self.otpNodes[0]] self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) self.cbas_util.createConn("default") self.cbas_node = self.cluster.cbas_nodes[0] elif self.rebalance_node == "NC": node_in_test = self.cluster.cbas_nodes[:self.how_many] otpNodes = self.nc_otpNodes[:self.how_many] else: node_in_test = [self.cbas_node ] + self.cluster.cbas_nodes[:self.how_many] otpNodes = self.otpNodes[:self.how_many + 1] self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[self.how_many]) self.cbas_util.createConn("default") replicas_before_rebalance = len( self.cbas_util.get_replicas_info(self.shell)) if self.rebalance_type == 'in': if self.restart_rebalance: self.cluster_util.add_all_nodes_then_rebalance( self.cluster, self.cluster.cbas_nodes[ self.input.param("nc_nodes_to_add"):self.how_many + self.input.param("nc_nodes_to_add")], wait_for_completion=False) self.sleep(2) if self.rest._rebalance_progress_status() == "running": self.assertTrue(self.rest.stop_rebalance(wait_timeout=120), "Failed while stopping rebalance.") self.sleep( 30, "Wait for some time after rebalance is stopped.") else: self.fail( "Rebalance completed before the test could have stopped rebalance." ) self.rebalance(wait_for_completion=False) else: self.cluster_util.add_all_nodes_then_rebalance( self.cluster, self.cluster.cbas_nodes[ self.input.param("nc_nodes_to_add"):self.how_many + self.input.param("nc_nodes_to_add")], wait_for_completion=False) replicas_before_rebalance += self.replica_change else: if self.restart_rebalance: self.cluster_util.remove_node(self.cluster, otpNodes, wait_for_rebalance=False) self.sleep(2) if self.rest._rebalance_progress_status() == "running": self.assertTrue(self.rest.stop_rebalance(wait_timeout=120), "Failed while stopping rebalance.") self.sleep( 30, "Wait for some time after rebalance is stopped.") else: self.fail( "Rebalance completed before the test could have stopped rebalance." 
) self.rebalance(wait_for_completion=False, ejected_nodes=[node.id for node in otpNodes]) else: self.cluster_util.remove_node(self.cluster, otpNodes, wait_for_rebalance=False) replicas_before_rebalance -= self.replica_change self.sleep(30) str_time = time.time() while self.rest._rebalance_progress_status( ) == "running" and time.time() < str_time + 300: replicas = self.cbas_util.get_replicas_info(self.shell) if replicas: for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.sleep(2) replicas = self.cbas_util.get_replicas_info(self.shell) replicas_after_rebalance = len(replicas) self.assertEqual( replicas_after_rebalance, replicas_before_rebalance, "%s,%s" % (replicas_after_rebalance, replicas_before_rebalance)) for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.assertEqual( replica['status'], "IN_SYNC", "Replica state is incorrect: %s" % replica['status']) # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.log.info("Items before service restart: %s"%items_in_cbas_bucket) count = 0 while self.cbas_util.fetch_analytics_cluster_response( )['state'] != "ACTIVE" and count < 60: self.sleep(5) count += 1 items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) self.log.info("After rebalance operation docs in CBAS bucket : %s" % items_in_cbas_bucket) if items_in_cbas_bucket < self.num_items * 2 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did interrupted and restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before rebalance operation." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test[0]) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) if self.rebalance_node == "NC": self.assertTrue(aborted_count == 0, "Some queries aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 2): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) self.ingest_more_data() def test_cancel_CC_rebalance(self): pass def test_chain_rebalance_out_cc(self): self.setup_for_test(skip_data_loading=True) self.ingestion_in_progress() total_cbas_nodes = len(self.otpNodes) while total_cbas_nodes > 1: cc_ip = self.cbas_util.retrieve_cc_ip(shell=self.shell) for otpnode in self.otpNodes: if otpnode.ip == cc_ip: self.cluster_util.remove_node(self.cluster, [otpnode], wait_for_rebalance=True) for server in self.cluster.cbas_nodes: if cc_ip != server.ip: self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, server) self.cbas_util.createConn("default") self.cbas_node = server break # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.log.info("Items before service restart: %s"%items_in_cbas_bucket) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) self.log.info( "After rebalance operation docs in CBAS bucket : %s" % items_in_cbas_bucket) if items_in_cbas_bucket < self.num_items * 2 and items_in_cbas_bucket > self.num_items: self.log.info( "Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did interrupted and restarting from 0." ) else: self.log.info( "Data Ingestion did not interrupted but complete before rebalance operation." ) query = "select count(*) from {0};".format( self.cbas_dataset_name) self.cbas_util._run_concurrent_queries( query, "immediate", 10) break total_cbas_nodes -= 1 if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 2): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) self.ingest_more_data() def test_cc_swap_rebalance(self): self.restart_rebalance = self.input.param('restart_rebalance', False) self.setup_for_test(skip_data_loading=True) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() replicas_before_rebalance = len( self.cbas_util.get_replicas_info(self.shell)) self.cluster_util.add_node(node=self.cluster.cbas_nodes[-1], rebalance=False) swap_nc = self.input.param('swap_nc', False) if not swap_nc: out_nodes = [self.otpNodes[0]] self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) self.cbas_util.createConn("default") self.cbas_node = self.cluster.cbas_nodes[0] else: out_nodes = [self.otpNodes[1]] self.cluster_util.remove_node(self.cluster, out_nodes, wait_for_rebalance=False) self.sleep(5, "Wait for sometime after rebalance started.") if self.restart_rebalance: if self.rest._rebalance_progress_status() == "running": self.assertTrue(self.rest.stop_rebalance(wait_timeout=120), "Failed while stopping rebalance.") self.sleep(10) else: self.fail( "Rebalance completed before the test could have stopped rebalance." 
) self.rebalance(ejected_nodes=[node.id for node in out_nodes], wait_for_completion=False) self.sleep(5) str_time = time.time() while self.rest._rebalance_progress_status( ) == "running" and time.time() < str_time + 300: replicas = self.cbas_util.get_replicas_info(self.shell) if replicas: for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.sleep(30) replicas = self.cbas_util.get_replicas_info(self.shell) replicas_after_rebalance = len(replicas) self.assertEqual( replicas_after_rebalance, replicas_before_rebalance, "%s,%s" % (replicas_after_rebalance, replicas_before_rebalance)) for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.assertEqual( replica['status'], "IN_SYNC", "Replica state is incorrect: %s" % replica['status']) # items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name) # self.log.info("Items before service restart: %s"%items_in_cbas_bucket) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) self.log.info("After rebalance operation docs in CBAS bucket : %s" % items_in_cbas_bucket) if items_in_cbas_bucket < self.num_items * 2 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did interrupted and restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before rebalance operation." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(self.cluster.master) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( self.cluster.master, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 2): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) self.ingest_more_data() def test_reboot_nodes(self): #Test for reboot CC and reboot all nodes. 
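# Flow (comment added for clarity): depending on the node_type param this test reboots
# the CC node, every NC node, or all analytics nodes in turn, then verifies that the
# replica set reported by get_replicas_info() is unchanged and IN_SYNC and that the
# dataset still converges to num_items * 2 once ingestion resumes.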
self.setup_for_test(skip_data_loading=True) self.ingestion_in_progress() self.node_type = self.input.param('node_type', 'CC') replica_nodes_before_reboot = self.cbas_util.get_replicas_info( self.shell) replicas_before_reboot = len( self.cbas_util.get_replicas_info(self.shell)) if self.node_type == "CC": shell = RemoteMachineShellConnection(self.cbas_node) shell.reboot_server_and_wait_for_cb_run(self.cluster_util, self.cbas_node) shell.disconnect() elif self.node_type == "NC": for server in self.cluster.cbas_nodes: shell = RemoteMachineShellConnection(server) shell.reboot_server_and_wait_for_cb_run( self.cluster_util, server) shell.disconnect() else: shell = RemoteMachineShellConnection(self.cbas_node) shell.reboot_server_and_wait_for_cb_run(self.cluster_util, self.cbas_node) shell.disconnect() for server in self.cluster.cbas_nodes: shell = RemoteMachineShellConnection(server) shell.reboot_server_and_wait_for_cb_run( self.cluster_util, server) shell.disconnect() self.sleep(60) replica_nodes_after_reboot = self.cbas_util.get_replicas_info( self.shell) replicas_after_reboot = len(replica_nodes_after_reboot) self.assertTrue( replica_nodes_after_reboot == replica_nodes_before_reboot, "Replica nodes changed after reboot. Before: %s , After : %s" % (replica_nodes_before_reboot, replica_nodes_after_reboot)) self.assertTrue( replicas_after_reboot == replicas_before_reboot, "Number of Replica nodes changed after reboot. Before: %s , After : %s" % (replicas_before_reboot, replicas_after_reboot)) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 2): self.fail( "No. 
of items in CBAS dataset do not match that in the CB bucket" ) for replica in replica_nodes_after_reboot: self.log.info("replica state during rebalance: %s" % replica['status']) self.assertEqual( replica['status'], "IN_SYNC", "Replica state is incorrect: %s" % replica['status']) self.ingest_more_data() def test_failover(self): self.setup_for_test(skip_data_loading=True) self.rebalance_node = self.input.param('rebalance_node', 'CC') self.how_many = self.input.param('how_many', 1) self.restart_rebalance = self.input.param('restart_rebalance', False) self.replica_change = self.input.param('replica_change', 0) self.add_back = self.input.param('add_back', False) query = "select sleep(count(*),50000) from {0};".format( self.cbas_dataset_name) handles = self.cbas_util._run_concurrent_queries(query, "async", 10) self.ingestion_in_progress() if self.rebalance_node == "CC": node_in_test = [self.cbas_node] otpNodes = [self.otpNodes[0]] self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[0]) self.cbas_util.createConn("default") self.cbas_node = self.cluster.cbas_nodes[0] elif self.rebalance_node == "NC": node_in_test = self.cluster.cbas_nodes[:self.how_many] otpNodes = self.nc_otpNodes[:self.how_many] else: node_in_test = [self.cbas_node ] + self.cluster.cbas_nodes[:self.how_many] otpNodes = self.otpNodes[:self.how_many + 1] self.cbas_util.closeConn() self.cbas_util = CbasUtil(self.cluster.master, self.cluster.cbas_nodes[self.how_many]) self.cbas_util.createConn("default") replicas_before_rebalance = len( self.cbas_util.get_replicas_info(self.shell)) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) self.log.info("Items before failover node: %s" % items_in_cbas_bucket) if self.restart_rebalance: graceful_failover = self.input.param("graceful_failover", False) failover_task = self._cb_cluster.async_failover( self.input.servers, node_in_test, graceful_failover) self.task_manager.get_task_result(failover_task) if self.add_back: for otpnode in otpNodes: self.rest.set_recovery_type('ns_1@' + otpnode.ip, "full") self.rest.add_back_node('ns_1@' + otpnode.ip) self.rebalance(wait_for_completion=False) else: self.rebalance(ejected_nodes=[node.id for node in otpNodes], wait_for_completion=False) self.sleep(2) if self.rest._rebalance_progress_status() == "running": self.assertTrue(self.rest.stop_rebalance(wait_timeout=120), "Failed while stopping rebalance.") if self.add_back: self.rebalance(wait_for_completion=False) else: self.rebalance( ejected_nodes=[node.id for node in otpNodes], wait_for_completion=False) else: self.fail( "Rebalance completed before the test could have stopped rebalance." 
) else: graceful_failover = self.input.param("graceful_failover", False) failover_task = self._cb_cluster.async_failover( self.input.servers, node_in_test, graceful_failover) self.task_manager.get_task_result(failover_task) if self.add_back: for otpnode in otpNodes: self.rest.set_recovery_type('ns_1@' + otpnode.ip, "full") self.rest.add_back_node('ns_1@' + otpnode.ip) self.rebalance(wait_for_completion=False) replicas_before_rebalance -= self.replica_change self.sleep(5) str_time = time.time() while self.rest._rebalance_progress_status( ) == "running" and time.time() < str_time + 300: replicas = self.cbas_util.get_replicas_info(self.shell) if replicas: for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.sleep(15) replicas = self.cbas_util.get_replicas_info(self.shell) replicas_after_rebalance = len(replicas) self.assertEqual( replicas_after_rebalance, replicas_before_rebalance, "%s,%s" % (replicas_after_rebalance, replicas_before_rebalance)) for replica in replicas: self.log.info("replica state during rebalance: %s" % replica['status']) self.assertEqual( replica['status'], "IN_SYNC", "Replica state is incorrect: %s" % replica['status']) items_in_cbas_bucket = 0 start_time = time.time() while (items_in_cbas_bucket == 0 or items_in_cbas_bucket == -1) and time.time() < start_time + 60: try: items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset( self.cbas_dataset_name) except: pass self.sleep(1) self.log.info("After rebalance operation docs in CBAS bucket : %s" % items_in_cbas_bucket) if items_in_cbas_bucket < self.num_items * 2 and items_in_cbas_bucket > self.num_items: self.log.info("Data Ingestion Interrupted successfully") elif items_in_cbas_bucket < self.num_items: self.log.info( "Data Ingestion did interrupted and restarting from 0.") else: self.log.info( "Data Ingestion did not interrupted but complete before rebalance operation." ) run_count = 0 fail_count = 0 success_count = 0 aborted_count = 0 shell = RemoteMachineShellConnection(node_in_test[0]) for handle in handles: status, hand = self.cbas_util.retrieve_request_status_using_handle( node_in_test, handle, shell) if status == "running": run_count += 1 self.log.info("query with handle %s is running." % handle) elif status == "failed": fail_count += 1 self.log.info("query with handle %s is failed." % handle) elif status == "success": success_count += 1 self.log.info("query with handle %s is successful." % handle) else: aborted_count += 1 self.log.info("Queued job is deleted: %s" % status) self.log.info("After service restart %s queued jobs are Running." % run_count) self.log.info("After service restart %s queued jobs are Failed." % fail_count) self.log.info("After service restart %s queued jobs are Successful." % success_count) self.log.info("After service restart %s queued jobs are Aborted." % aborted_count) if self.rebalance_node == "NC": self.assertTrue(aborted_count == 0, "Some queries aborted") query = "select count(*) from {0};".format(self.cbas_dataset_name) self.cbas_util._run_concurrent_queries(query, "immediate", 100) if not self.cbas_util.validate_cbas_dataset_items_count( self.cbas_dataset_name, self.num_items * 2): self.fail( "No. of items in CBAS dataset do not match that in the CB bucket" ) self.ingest_more_data()
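# The ingestion-interruption check (poll the dataset item count until it is usable, then
# classify it against num_items and num_items * 2) is repeated almost verbatim in
# test_rebalance, test_chain_rebalance_out_cc, test_cc_swap_rebalance and test_failover
# above. The function below is only a minimal consolidation sketch of that pattern; the
# function name and parameters are illustrative, not part of the framework, and it
# assumes the module-level `time` import already used elsewhere in this file.
def classify_ingestion_progress(cbas_util, dataset_name, num_items, timeout=60):
    """Sketch: poll the CBAS dataset count and report how far ingestion got."""
    items = 0
    start = time.time()
    # Poll until the dataset reports a usable count or the timeout expires
    while items in (0, -1) and time.time() < start + timeout:
        try:
            items, _ = cbas_util.get_num_items_in_cbas_dataset(dataset_name)
        except Exception:
            pass
        time.sleep(1)
    if num_items < items < num_items * 2:
        return "interrupted"   # ingestion was paused part-way through
    elif items <= num_items:
        return "restarted"     # ingestion appears to have restarted from 0
    return "completed"         # ingestion finished before the cluster operation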