def test_bulk_sync_write_in_progress(self):
    """
    Simulate sync_write_in_progress and validate parallel bulk CRUDs fail.

    Flow (as implemented below):
    1. Pick target nodes and induce the configured error on them
    2. Start an async spec-based doc load (task-1) restricted to the
       vbuckets owned by the target nodes
    3. Run a second bulk loader (task-2) on the same doc_gens and expect
       every doc to fail with AmbiguousTimeoutException carrying the
       KV_SYNC_WRITE_IN_PROGRESS retry reason
    4. Revert the error, let task-1 finish, and validate doc counts

    Test params:
    - doc_ops: ';'-separated pair, e.g. "create;update" — [0] drives the
      spec-based load, [1] drives the parallel per-key load
    """
    doc_ops = self.input.param("doc_ops").split(';')
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    active_vbs = dict()
    replica_vbs = dict()
    sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        # NOTE(review): vb_info["init"] is re-created on every iteration,
        # so only the last node's seqno snapshot survives — confirm intent
        vb_info["init"] = dict()
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' vb_num which are of type=replica
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    # For MAJORITY_AND_PERSIST_TO_ACTIVE target the active vbuckets
    # (union across nodes); otherwise target the replica vbuckets common
    # to all affected nodes (intersection)
    target_vbs = replica_vbs
    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vbs = active_vbs
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += target_vbs[target_node.ip]
    else:
        target_vbuckets = target_vbs[target_nodes[0].ip]
        if len(target_nodes) > 1:
            index = 1
            while index < len(target_nodes):
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_nodes[index].ip])))
                index += 1

    # Spec for the first (async, spec-based) doc load
    doc_load_spec = dict()
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"

    # doc_ops[0] selects which mutation the spec-based load performs
    if doc_ops[0] == "create":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "update":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "replace":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == "delete":
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

    # Induce error condition for testing
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            async_load=True)
    self.sleep(5, "Wait for doc ops to reach server")

    # Parallel task may run without durability when configured so
    tem_durability = self.durability_level
    if self.with_non_sync_writes:
        tem_durability = "NONE"

    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                for op_type in c_meta:
                    # This will support both sync-write and non-sync-writes
                    doc_loader_task_2 = self.task.async_load_gen_docs(
                        self.cluster, self.bucket,
                        c_meta[op_type]["doc_gen"], doc_ops[1], 0,
                        scope=s_name,
                        collection=c_name,
                        sdk_client_pool=self.sdk_client_pool,
                        batch_size=self.crud_batch_size,
                        process_concurrency=1,
                        replicate_to=self.replicate_to,
                        persist_to=self.persist_to,
                        durability=tem_durability,
                        timeout_secs=3,
                        print_ops_rate=False,
                        skip_read_on_error=True,
                        task_identifier="parallel_task2")
                    self.task.jython_task_manager.get_task_result(
                        doc_loader_task_2)

                    # Validation to verify the sync_in_write_errors
                    # in doc_loader_task_2
                    failed_docs = doc_loader_task_2.fail
                    if len(failed_docs.keys()) != 1:
                        self.log_failure(
                            "Exception not seen for docs: %s"
                            % failed_docs)

                    valid_exception = self.durability_helper\
                        .validate_durability_exception(
                            failed_docs,
                            SDKException.AmbiguousTimeoutException,
                            retry_reason=sync_write_in_progress)

                    if not valid_exception:
                        self.log_failure("Got invalid exception")

    # Revert the introduced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops[0] == "update":
        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"], "read",
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        # Each successfully-updated doc must report
                        # mutated == 1
                        for key, doc_info in read_task.success.items():
                            if doc_info["cas"] != 0 \
                                    and json.loads(str(doc_info["value"])
                                                   )["mutated"] != 1:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Validate doc_count per collection
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
def test_sub_doc_sync_write_in_progress(self):
    """
    Test to simulate sync_write_in_progress error and validate the behavior
    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select nodes to simulate the error which will affect the durability
    2. Enable the specified error_scenario on the selected nodes
    3. Perform individual CRUDs and verify sync_write_in_progress errors
    4. Validate the end results

    Test params:
    - doc_ops: single op name (default "insert") selecting which mutation
      the background spec-based load performs while sub-doc ops are tried
    """
    doc_ops = self.input.param("doc_ops", "insert")
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    active_vbs = dict()
    replica_vbs = dict()
    vb_info["init"] = dict()
    doc_load_spec = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"] = dict()
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' vb_num which are of type=replica
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    # For MAJORITY_AND_PERSIST_TO_ACTIVE target the active vbuckets
    # (union across nodes); otherwise the replica vbuckets common to
    # all affected nodes (intersection)
    target_vbs = replica_vbs
    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vbs = active_vbs
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += target_vbs[target_node.ip]
    else:
        target_vbuckets = target_vbs[target_nodes[0].ip]
        if len(target_nodes) > 1:
            index = 1
            while index < len(target_nodes):
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_nodes[index].ip])))
                index += 1

    amb_timeout = SDKException.AmbiguousTimeoutException
    kv_sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
    doc_not_found_exception = SDKException.DocumentNotFoundException

    self.load_data_for_sub_doc_ops()

    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["subdoc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

    # Acquire SDK client from the pool for performing doc_ops locally
    client = self.sdk_client_pool.get_client_for_bucket(self.bucket)

    # Override the crud_batch_size
    self.crud_batch_size = 5

    # Update mutation spec based on the required doc_operation.
    # FIX: the UPDATE branch previously used `doc_ops in ...UPDATE`
    # (substring membership on a string constant — "up", "date", etc.
    # would match); changed to `==` for an exact match, consistent with
    # every other branch.
    if doc_ops == DocLoading.Bucket.DocOps.CREATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.DocOps.UPDATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.DocOps.DELETE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.INSERT:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.UPSERT:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 1
    elif doc_ops == DocLoading.Bucket.SubDocOps.REMOVE:
        doc_load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 1

    # This is to support both sync-write and non-sync-writes
    tem_durability = self.durability_level
    if self.with_non_sync_writes:
        tem_durability = Bucket.DurabilityLevel.NONE

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(5, "Wait for error simulation to take effect")

    # Initialize tasks and store the task objects
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            mutation_num=2,
            batch_size=1,
            async_load=True)

    # Start the doc_loader_task
    self.sleep(10, "Wait for task_1 CRUDs to reach server")

    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                for op_type in c_meta:
                    key, _ = c_meta[op_type]["doc_gen"].next()

                    # Default expectation: sub-doc op times out because a
                    # SyncWrite for the same key is still in progress.
                    # For 'create' the doc isn't committed yet, so the
                    # sub-doc op fails with DocumentNotFound instead.
                    expected_exception = amb_timeout
                    retry_reason = kv_sync_write_in_progress
                    # Consistency: use the DocOps constant instead of the
                    # bare "create" literal used elsewhere in this method
                    if doc_ops == DocLoading.Bucket.DocOps.CREATE:
                        expected_exception = doc_not_found_exception
                        retry_reason = None

                    for sub_doc_op in [
                            DocLoading.Bucket.SubDocOps.INSERT,
                            DocLoading.Bucket.SubDocOps.UPSERT,
                            DocLoading.Bucket.SubDocOps.REMOVE]:
                        val = ["my_mutation", "val"]
                        if sub_doc_op \
                                == DocLoading.Bucket.SubDocOps.REMOVE:
                            val = "mutated"
                        result = client.crud(sub_doc_op, key, val,
                                             durability=tem_durability,
                                             timeout=2)
                        if result[0]:
                            self.log_failure("Doc crud succeeded for %s"
                                             % op_type)
                        elif expected_exception \
                                not in str(result[1][key]["error"]):
                            self.log_failure(
                                "Invalid exception for key %s: %s"
                                % (key, result[1][key]["error"]))
                        elif retry_reason is not None and \
                                retry_reason \
                                not in str(result[1][key]["error"]):
                            self.log_failure(
                                "Retry reason missing for key %s: %s"
                                % (key, result[1][key]["error"]))

    # Revert the introduced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Wait for doc_loader_task_1 to complete
    self.task.jython_task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops == DocLoading.Bucket.DocOps.UPDATE:
        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        c_meta[op_type]["doc_gen"].reset()
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"],
                            DocLoading.Bucket.DocOps.READ,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        # Updated docs must report mutated == 2
                        # (mutation_num=2 was used for the load)
                        for key, doc_info in read_task.success.items():
                            if doc_info["cas"] != 0 and \
                                    json.loads(str(doc_info["value"])
                                               )["mutated"] != 2:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Release the acquired SDK client
    self.sdk_client_pool.release_client(client)

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
    self.validate_test_failure()
def test_sub_doc_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    # Need at least 2 replicas so durability survives one node crash
    if self.num_replicas < 2:
        self.assertTrue(False, msg="Required: num_replicas > 1")

    # Override num_of_nodes affected to 1
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    def_bucket = self.bucket_util.buckets[0]

    self.load_data_for_sub_doc_ops()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    for node in target_nodes:
        # Create shell_connections
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        active_vbs = cbstat_obj[node.ip].vbucket_list(
            def_bucket.name, "active")
        active_vbs_in_target_nodes += active_vbs
        # Snapshot seqno/failover stats to compare after the CRUDs
        vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            def_bucket.name)
        failover_info["init"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(def_bucket.name)

    # Remove active vbuckets from doc_loading to avoid errors
    load_spec = dict()
    # load_spec["target_vbuckets"] = list(set(target_vbuckets)
    #                                     ^ set(active_vbs_in_target_nodes))
    load_spec["doc_crud"] = dict()
    load_spec["subdoc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 10
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 50
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 25
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 25

    self.log.info("Perform 'create', 'update', 'delete' mutations")
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            load_spec,
            mutation_num=1,
            async_load=True)

    self.sleep(5, "Wait for doc loaders to start loading data")

    for node in target_nodes:
        # Perform specified action
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=def_bucket.name)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud(mutation_num=2)

    # Wait for document_loader tasks to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Sub_doc CRUDs failed with process crash")

    # Revert the induced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=def_bucket.name)

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        vb_info_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
        failover_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(def_bucket.name)

        # Failover validation
        # NOTE(review): this asserts the failover stats are UNCHANGED
        # (init == afterCrud), but the message says "not updated" —
        # compare with test_with_process_crash, which expects a change
        # for KILL_MEMCACHED. Verify the intended direction.
        val = \
            failover_info["init"][node.ip] \
            == failover_info["afterCrud"][node.ip]
        error_msg = "Failover stats not updated after error condition"
        self.assertTrue(val, msg=error_msg)

        # Seq_no validation (High level)
        val = \
            vb_info_info["init"][node.ip] \
            != vb_info_info["afterCrud"][node.ip]
        self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()

    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_sync_write_in_progress(self):
    """
    Simulate sync_write_in_progress and validate per-key CRUD behavior.

    Flow (as implemented below):
    1. Induce the configured error on target nodes so SyncWrites stall
    2. Start an async spec-based doc load (doc_ops[0]) on the affected
       vbuckets
    3. For each pending key, issue a second CRUD (doc_ops[1]) through a
       direct SDK client and expect AmbiguousTimeoutException with the
       KV_SYNC_WRITE_IN_PROGRESS retry reason (DocumentNotFound for
       delete/replace after a pending create)
    4. Read each key to confirm the prepared-but-uncommitted value is
       not visible
    5. Revert the error and let the initial load finish

    Test params:
    - doc_ops: ';'-separated pair (default "create;create")
    """
    doc_ops = self.input.param("doc_ops", "create;create").split(';')
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    active_vbs = dict()
    replica_vbs = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    # Acquire SDK client from the pool for performing doc_ops locally
    client = SDKClient([self.cluster.master], self.bucket)

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        # NOTE(review): vb_info["init"] is re-created each iteration, so
        # only the last node's snapshot survives — confirm intent
        vb_info["init"] = dict()
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' vb_num which are of type=replica
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    # For MAJORITY_AND_PERSIST_TO_ACTIVE target active vbuckets (union);
    # otherwise replica vbuckets common to all nodes (intersection)
    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vbs = active_vbs
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += target_vbs[target_node.ip]
    else:
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        if len(target_nodes) > 1:
            index = 1
            while index < len(target_nodes):
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(replica_vbs[target_nodes[index].ip])))
                index += 1

    # Spec for the first (async, spec-based) doc load
    doc_load_spec = dict()
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

    # doc_ops[0] selects which mutation the spec-based load performs
    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.UPDATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.REPLACE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.DELETE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

    # Induce error condition for testing
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(3, "Wait for error simulation to take effect")

    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            async_load=True)
    self.sleep(5, "Wait for doc ops to reach server")

    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                client.select_collection(s_name, c_name)
                for op_type in c_meta:
                    key, value = c_meta[op_type]["doc_gen"].next()

                    # Second CRUD on the same key, with or without
                    # durability depending on with_non_sync_writes
                    if self.with_non_sync_writes:
                        fail = client.crud(doc_ops[1], key, value, exp=0,
                                           timeout=2, time_unit="seconds")
                    else:
                        fail = client.crud(
                            doc_ops[1], key, value, exp=0,
                            durability=self.durability_level,
                            timeout=2, time_unit="seconds")

                    expected_exception = \
                        SDKException.AmbiguousTimeoutException
                    retry_reason = \
                        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
                    # delete/replace on a not-yet-committed create fails
                    # with DocumentNotFound instead of a timeout
                    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE \
                            and doc_ops[1] in \
                            [DocLoading.Bucket.DocOps.DELETE,
                             DocLoading.Bucket.DocOps.REPLACE]:
                        expected_exception = \
                            SDKException.DocumentNotFoundException
                        retry_reason = None

                    # Validate the returned error from the SDK
                    if expected_exception not in str(fail["error"]):
                        self.log_failure("Invalid exception for %s: %s"
                                         % (key, fail["error"]))
                    if retry_reason \
                            and retry_reason not in str(fail["error"]):
                        self.log_failure(
                            "Invalid retry reason for %s: %s"
                            % (key, fail["error"]))

                    # Try reading the value in SyncWrite state
                    fail = client.crud("read", key)
                    if doc_ops[0] == "create":
                        # Expected KeyNotFound in case of CREATE op
                        if fail["status"] is True:
                            self.log_failure(
                                "%s returned value during SyncWrite %s"
                                % (key, fail))
                    else:
                        # Expects prev val in case of other operations
                        if fail["status"] is False:
                            self.log_failure(
                                "Key %s read failed for prev value: %s"
                                % (key, fail))

    # Revert the introduced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Release the acquired SDK client
    client.close()
    self.validate_test_failure()
def test_sub_doc_with_persistence_issues(self):
    """
    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations met the durability condition
    """
    # Persistence-based durability cannot succeed while persistence is
    # broken, so the test is skipped for those levels.
    # NOTE(review): this compares .upper() against the constants while
    # test_with_persistence_issues compares the raw value — confirm the
    # constants' casing makes both checks equivalent.
    if self.durability_level.upper() in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    def_bucket = self.bucket_util.buckets[0]

    # Mixed read + sub-doc mutation workload
    load_spec = dict()
    load_spec["doc_crud"] = dict()
    load_spec["subdoc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 50
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 20
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 10
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 10

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    # Create new docs for sub-doc operations to run
    self.load_data_for_sub_doc_ops()

    self.log.info("Will simulate error condition on %s" % target_nodes)
    for node in target_nodes:
        # Create shell_connections
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        active_vbs = cbstat_obj[node.ip].vbucket_list(
            def_bucket.name, "active")
        active_vbs_in_target_nodes += active_vbs
        # Snapshot seqno/failover stats to compare after the CRUDs
        vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            def_bucket.name)
        failover_info["init"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(def_bucket.name)

    for node in target_nodes:
        # Perform specified action
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=def_bucket.name)

    # Perform CRUDs with induced error scenario is active
    self.log.info("Perform 'insert', 'upsert', 'remove' mutations")
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            load_spec,
            mutation_num=0,
            async_load=True)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud(mutation_num=1)

    # Wait for doc_loading to complete and validate the doc ops
    # NOTE(review): unlike sibling tests, validate_doc_loading_results()
    # is not called before checking .result here — confirm intentional.
    self.task_manager.get_task_result(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed with persistence issue")

    # Revert the induced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=def_bucket.name)

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        vb_info_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
        failover_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(def_bucket.name)

        # Failover validation
        # NOTE(review): asserts failover stats are UNCHANGED while the
        # message says "not updated" — verify intended direction.
        val = \
            failover_info["init"][node.ip] \
            == failover_info["afterCrud"][node.ip]
        self.assertTrue(val, msg="Failover stats not updated")

        # Seq_no validation (High level)
        val = \
            vb_info_info["init"][node.ip] \
            != vb_info_info["afterCrud"][node.ip]
        self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    # Need at least 2 replicas so durability survives one node crash
    if self.num_replicas < 2:
        self.assertTrue(False, msg="Required: num_replicas > 1")

    # Override num_of_nodes affected to 1 (Positive case)
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, "active")
        # Snapshot seqno/failover stats to compare after the CRUDs
        vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        failover_info["init"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

    # Remove active vbuckets from doc_loading to avoid errors
    load_spec = dict()
    load_spec["doc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"
    # Symmetric difference: all vbuckets except those active on the
    # crashing nodes
    load_spec["target_vbuckets"] = list(
        set(range(0, 1024)) ^ set(active_vbs_in_target_nodes))

    self.log.info("Perform 'create', 'update', 'delete' mutations")
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            load_spec,
            mutation_num=1,
            async_load=True)

    self.sleep(5, "Wait for doc loaders to start loading data")

    for node in target_nodes:
        # Create shell_connections
        # (re-created here; overwrites the connection opened above)
        shell_conn[node.ip] = RemoteMachineShellConnection(node)

        # Perform specified action
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud()

    # Wait for document_loader tasks to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed with process crash")

    # Disk errors are not revertible here; only revert shell-level errors
    if self.simulate_error \
            not in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

            # Disconnect the shell connection
            shell_conn[node.ip].disconnect()
        self.sleep(10, "Wait for node recovery to complete")

        # In case of error with Ephemeral bucket, need to rebalance
        # to make sure data is redistributed properly
        if self.bucket_type == Bucket.Type.EPHEMERAL:
            # Retry the rebalance up to 2 times before failing
            retry_num = 0
            result = None
            while retry_num != 2:
                result = self.task.rebalance(
                    self.servers[0:self.nodes_init],
                    [], [])
                if result:
                    break
                retry_num += 1
                self.sleep(10, "Wait before retrying rebalance")

            self.assertTrue(result, "Rebalance failed")

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        vb_info_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        failover_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

        # Failover stat validation
        # KILL_MEMCACHED (and non-STOP errors on Ephemeral) restart the
        # process, which is expected to advance the failover log; other
        # scenarios must leave it untouched
        if self.simulate_error == CouchbaseError.KILL_MEMCACHED:
            val = failover_info["init"][node.ip] \
                  != failover_info["afterCrud"][node.ip]
        else:
            if self.simulate_error != CouchbaseError.STOP_MEMCACHED \
                    and self.bucket_type == Bucket.Type.EPHEMERAL:
                val = failover_info["init"][node.ip] \
                      != failover_info["afterCrud"][node.ip]
            else:
                val = failover_info["init"][node.ip] \
                      == failover_info["afterCrud"][node.ip]
        error_msg = "Failover stats mismatch after error condition:" \
                    " %s != %s" \
                    % (failover_info["init"][node.ip],
                       failover_info["afterCrud"][node.ip])
        self.assertTrue(val, msg=error_msg)

        # Seq_no validation (High level)
        val = \
            vb_info_info["init"][node.ip] \
            != vb_info_info["afterCrud"][node.ip]
        self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

    # Doc count validation
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_with_persistence_issues(self):
    """
    Verify document CRUDs still succeed under disk-related failures when
    the durability level does not require persistence.

    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Note: self.sdk_timeout value is considered as 'seconds'
    """

    # Persistence-based durability levels would legitimately fail under
    # disk errors, so this scenario only makes sense for the others.
    if self.durability_level in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    error_sim = dict()          # per-node error simulators (or one DiskError)
    shell_conn = dict()         # per-node shell connections, keyed by ip
    cbstat_obj = dict()         # per-node Cbstats helpers, keyed by ip
    failover_info = dict()      # failover stats snapshots: "init"/"afterCrud"
    vb_info_info = dict()       # vbucket seqno snapshots: "init"/"afterCrud"
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    # Snapshot pre-failure stats so post-CRUD values can be compared.
    self.log.info("Simulate error condition on %s" % target_nodes)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, "active")
        vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        failover_info["init"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

    if self.simulate_error \
            in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
        # Disk errors use a single DiskError driver (error_sim is rebound
        # from dict to a DiskError object in this branch).
        error_sim = DiskError(self.log, self.task_manager,
                              self.cluster.master, target_nodes,
                              60, 0, False, 120,
                              disk_location="/data")
        error_sim.create(action=self.simulate_error)
    else:
        for node in target_nodes:
            # Create shell_connections
            # NOTE(review): this overwrites the connection created in the
            # stats loop above. The Cbstats objects keep a reference to the
            # first connection, so the later disconnect() only closes this
            # second one — presumably intentional so cbstats keep working
            # after revert; confirm before "de-duplicating".
            shell_conn[node.ip] = RemoteMachineShellConnection(node)

            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

    # Perform CRUDs while the induced error scenario is active
    load_spec = dict()
    load_spec["doc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"

    self.log.info("Perform 'create', 'update', 'delete' mutations")
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            load_spec,
            mutation_num=1,
            async_load=True)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud(mutation_num=2)

    # Wait for doc_loading to complete and validate the doc ops
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed with persistence issue")

    if self.simulate_error \
            in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
        error_sim.revert(self.simulate_error)
    else:
        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

            # Disconnect the shell connection
            # NOTE(review): cbstat_obj is still used below; it appears to
            # rely on the first (non-disconnected) connection — see note
            # above. Verify before reordering.
            shell_conn[node.ip].disconnect()
        self.sleep(10, "Wait for node recovery to complete")

    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        vb_info_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        failover_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

        # Failover validation: no failover entry should have been created
        # by the simulated error, so the stats must be unchanged.
        val = \
            failover_info["init"][node.ip] \
            == failover_info["afterCrud"][node.ip]
        error_msg = "Failover stats got updated"
        self.assertTrue(val, msg=error_msg)

        # Seq_no validation (High level): CRUDs must have advanced seqnos.
        val = \
            vb_info_info["init"][node.ip] \
            != vb_info_info["afterCrud"][node.ip]
        self.assertTrue(val,
                        msg="vbucket seq_no not updated after CRUDs")

    self.validate_test_failure()

    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_timeout_with_crud_failures(self):
    """
    Verify SDK timeouts are honored and no mutation is committed while
    durability cannot be met (error simulated on a majority of nodes).

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'
    """

    # Local methods to validate vb_seqno
    def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
        # Compare one vbucket's seqno stats between two snapshots.
        # comparison="!=" -> expect equality (flag if they differ);
        # comparison="==" -> expect change (flag if they are equal).
        keys_to_check = ["high_seqno", "high_completed_seqno"]
        result = True
        for key in keys_to_check:
            if vb in stat_1.keys():
                # vbucket UUID must never change here; a mismatch means
                # an unexpected failover happened.
                if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
                    self.log_failure("Mismatch in vb-%s UUID. %s != %s"
                                     % (vb, stat_1[vb]["uuid"],
                                        stat_2[vb]["uuid"]))
                if comparison == "!=":
                    if stat_1[vb][key] != stat_2[vb][key]:
                        result = False
                        self.log.warning(
                            "Mismatch in vb-%s stat %s. %s != %s"
                            % (vb, key, stat_1[vb][key],
                               stat_2[vb][key]))
                elif stat_1[vb][key] == stat_2[vb][key]:
                    result = False
                    self.log.warning("Stat not updated for vb-%s stat %s. "
                                     "%s == %s"
                                     % (vb, key, stat_1[vb][key],
                                        stat_2[vb][key]))
        return result

    def validate_vb_seqno_stats():
        """
        Compare current vb seqnos against the "init" snapshot for the
        node currently bound to the enclosing loop variable `node`.

        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        vb_info["post_timeout"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        for tem_vb_num in range(self.cluster_util.vbuckets):
            tem_vb_num = str(tem_vb_num)
            if tem_vb_num not in affected_vbs:
                # vbuckets untouched by the failed CRUDs must be unchanged.
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num) is False:
                    self.log_failure("Unaffected vb-%s stat"
                                     % tem_vb_num)
            elif int(tem_vb_num) in target_nodes_vbuckets["active"]:
                # Active vbs on target nodes: warn only; the prepare may
                # have landed on the active before durability failed.
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num) is False:
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node.ip, "active", tem_vb_num))
            elif int(tem_vb_num) in target_nodes_vbuckets["replica"]:
                # Replica vbs: expect a change; stats may lag, so request
                # a retry instead of failing outright.
                if compare_vb_stat(vb_info["init"][node.ip],
                                   vb_info["post_timeout"][node.ip],
                                   tem_vb_num, comparison="==") is False:
                    retry_validation = True
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node.ip, "replica", tem_vb_num))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()   # vb numbers owned by target nodes
    vb_info = dict()                 # seqno snapshots per phase
    tasks = dict()                   # loader task per op_type
    doc_gen = dict()                 # doc generator per op_type
    affected_vbs = list()            # vbs touched by the timed-out CRUDs

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5
    self.key = "test_collections"
    self.sdk_timeout = 3

    # Select target vbucket type to load_docs
    target_vb_type = "replica"
    if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
            and self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vb_type = "active"

    # Create required scope/collection for successful CRUD operation
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
    self.collection_name = self.bucket_util.get_random_name()
    self.log.info("Creating scope::collection %s::%s"
                  % (self.scope_name, self.collection_name))
    self.create_scope_collection()

    # Load docs into created collection
    self.log.info("Loading data into created collection")
    load_gen = doc_generator(self.key, 0, self.num_items)
    task = self.task.async_load_gen_docs(
        self.cluster, self.bucket, load_gen, "create", 0,
        scope=self.scope_name,
        collection=self.collection_name,
        sdk_client_pool=self.sdk_client_pool,
        batch_size=200, process_concurrency=8,
        timeout_secs=60)
    self.task_manager.get_task_result(task)
    if self.subdoc_test:
        # Seed sub-document paths in half the docs for subdoc variants.
        load_gen = sub_doc_generator(self.key, 0, self.num_items / 2)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket,
            load_gen, Bucket_Op.SubDocOps.INSERT,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            path_create=True,
            batch_size=100,
            process_concurrency=8,
            durability=self.durability_level,
            scope=self.scope_name,
            collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(task)

    # Keep the in-memory bookkeeping in sync with the seeded load.
    self.bucket.scopes[self.scope_name].collections[
        self.collection_name].num_items = self.num_items

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        target_nodes_vbuckets["active"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        target_nodes_vbuckets["replica"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="replica")
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])

    # The CRUDs below must not complete before this wall-clock deadline,
    # proving the SDK timeout actually elapsed.
    curr_time = int(time.time())
    expected_timeout = curr_time + self.sdk_timeout

    # Restrict the load to vbuckets of the chosen type that are NOT also
    # present on the target nodes as the other type.
    if target_vb_type == "active":
        target_vbs = list(
            set(target_nodes_vbuckets[target_vb_type]).difference(
                set(target_nodes_vbuckets["replica"])))
    else:
        target_vbs = list(
            set(target_nodes_vbuckets[target_vb_type]).difference(
                set(target_nodes_vbuckets["active"])))

    # Create required doc_generators
    doc_gen["create"] = doc_generator(self.key, self.num_items,
                                      self.crud_batch_size,
                                      target_vbucket=target_vbs)
    doc_gen["delete"] = doc_generator(self.key, 0,
                                      self.crud_batch_size,
                                      target_vbucket=target_vbs)
    doc_gen["read"] = doc_generator(
        self.key, int(self.num_items / 3),
        self.crud_batch_size,
        target_vbucket=target_vbs)
    doc_gen["update"] = doc_generator(
        self.key, int(self.num_items / 2),
        self.crud_batch_size,
        target_vbucket=target_vbs)

    # Create required subdoc generators
    doc_gen["insert"] = sub_doc_generator(
        self.key, int(self.num_items / 2),
        self.crud_batch_size,
        target_vbucket=target_vbs)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size,
        template_index=1,
        target_vbucket=target_vbs)
    doc_gen["remove"] = sub_doc_generator(
        self.key, 0, self.crud_batch_size,
        target_vbucket=target_vbs)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(5, "Wait for error_simulation to take effect")

    ops_to_perform = [Bucket_Op.DocOps.CREATE, Bucket_Op.DocOps.UPDATE,
                      Bucket_Op.DocOps.READ, Bucket_Op.DocOps.DELETE]
    if self.subdoc_test:
        ops_to_perform = [Bucket_Op.SubDocOps.INSERT,
                          Bucket_Op.SubDocOps.UPSERT,
                          Bucket_Op.SubDocOps.REMOVE]

    for op_type in ops_to_perform:
        self.log.info("Starting doc op %s" % op_type)
        if op_type in Bucket_Op.DOC_OPS:
            tasks[op_type] = self.task.async_load_gen_docs(
                self.cluster, self.bucket, doc_gen[op_type],
                op_type, 0,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool,
                batch_size=1,
                process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                suppress_error_table=True,
                print_ops_rate=False,
                skip_read_on_error=True)
        else:
            tasks[op_type] = self.task.async_load_gen_sub_docs(
                self.cluster, self.bucket, doc_gen[op_type],
                op_type, 0,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool,
                path_create=True,
                batch_size=1,
                process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False)

        self.task.jython_task_manager.get_task_result(tasks[op_type])

        # Validate task failures
        if op_type == Bucket_Op.DocOps.READ:
            # Validation for read task: reads need no durability, so
            # they must all succeed even under the simulated error.
            if len(tasks[op_type].fail.keys()) != 0:
                self.log_failure("Read failed for few docs: %s"
                                 % tasks[op_type].fail.keys())
        else:
            # Validation of CRUDs - Update / Create / Delete: every
            # failure must be a DurabilityAmbiguous (timeout) error.
            for doc_id, crud_result in tasks[op_type].fail.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.cluster_util.vbuckets)
                if SDKException.DurabilityAmbiguousException \
                        not in str(crud_result["error"]):
                    self.log_failure(
                        "Invalid exception for doc %s, vb %s: %s"
                        % (doc_id, vb_num, crud_result))

    # Revert the specified error scenario
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Check whether the timeout triggered properly
    if int(time.time()) < expected_timeout:
        self.log_failure("Timed-out before expected time")

    # Collect the vbuckets targeted by the mutating generators; reads do
    # not mutate, so they are excluded.
    for op_type in ops_to_perform:
        if op_type == Bucket_Op.DocOps.READ:
            continue
        while doc_gen[op_type].has_next():
            doc_id, _ = doc_gen[op_type].next()
            affected_vbs.append(
                str(self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.cluster_util.vbuckets)))

    affected_vbs = list(set(affected_vbs))
    # Fetch latest stats and validate the seq_nos are not updated
    for node in target_nodes:
        retry_count = 0
        max_retry = 3
        while retry_count < max_retry:
            self.log.info("Trying to validate vbseq_no stats: %d"
                          % (retry_count + 1))
            retry_count += 1
            retry_required = validate_vb_seqno_stats()
            if not retry_required:
                break
            self.sleep(5, "Sleep for vbseq_no stats to update")
        else:
            # This will be exited only if `break` condition is not met
            # (while/else: all retries exhausted without success).
            self.log_failure(
                "validate_vb_seqno_stats verification failed")

    self.validate_test_failure()

    # Get SDK Client from client_pool
    sdk_client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket, self.scope_name, self.collection_name)

    # Doc error validation
    for op_type in ops_to_perform:
        task = tasks[op_type]

        # With a single node every mutation should have failed, since the
        # lone node is also the error-simulated one.
        if self.nodes_init == 1 \
                and op_type != Bucket_Op.DocOps.READ \
                and len(task.fail.keys()) != (doc_gen[op_type].end
                                              - doc_gen[op_type].start):
            self.log_failure(
                "Failed keys %d are less than expected %d"
                % (len(task.fail.keys()),
                   (doc_gen[op_type].end - doc_gen[op_type].start)))

        # Create table objects for display
        table_view = TableView(self.log.error)
        ambiguous_table_view = TableView(self.log.info)
        table_view.set_headers(["Key", "vBucket", "Exception"])
        ambiguous_table_view.set_headers(["Key", "vBucket"])

        # Iterate failed keys for validation
        for doc_key, doc_info in task.fail.items():
            vb_for_key = self.bucket_util.get_vbucket_num_for_key(
                doc_key)

            if SDKException.DurabilityAmbiguousException \
                    not in str(doc_info["error"]):
                table_view.add_row([doc_key, vb_for_key,
                                    doc_info["error"]])

            ambiguous_table_view.add_row([doc_key, str(vb_for_key)])
            # Sub-doc ops have no retry helper; full-doc ops are retried
            # now that the error condition is reverted.
            if op_type not in Bucket_Op.SUB_DOC_OPS:
                retry_success = \
                    self.durability_helper.retry_for_ambiguous_exception(
                        sdk_client, op_type, doc_key, doc_info)
                if not retry_success:
                    self.log_failure("%s failed in retry for %s"
                                     % (op_type, doc_key))

        # Display the tables (if any errors)
        table_view.display("Unexpected exception during %s" % op_type)
        ambiguous_table_view.display("D_Ambiguous exception during %s"
                                     % op_type)

    # Release the acquired client
    self.sdk_client_pool.release_client(sdk_client)

    # Verify doc count after expected CRUD failure
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()

    # Fetch latest stats and validate the values are updated
    for node in target_nodes:
        vb_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
            self.log_failure("vBucket seq_no stats not updated")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
def test_timeout_with_successful_crud(self):
    """
    Verify CRUDs eventually succeed when the simulated error condition
    is reverted while the (longer) SDK timeout is still running.

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Induce the error, then revert it before the timeout expires
    4. Validate all mutations succeed and vbucket seqnos advance

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    shell_conn = dict()     # per-node shell connections, keyed by ip
    cbstat_obj = dict()     # per-node Cbstats helpers, keyed by ip
    error_sim = dict()      # per-node CouchbaseError simulators
    vb_info = dict()        # vbucket seqno snapshots
    vb_info["init"] = dict()
    vb_info["afterCrud"] = dict()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    # Snapshot initial seqnos so post-CRUD movement can be validated.
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])

    # Base spec with all op percentages zeroed; each iteration below
    # deep-copies it and enables exactly one op type.
    doc_load_spec = dict()
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = self.sdk_timeout
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["subdoc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] = \
        "test_collections"

    doc_load_spec["doc_crud"][
        MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 0
    doc_load_spec["doc_crud"][
        MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 0
    doc_load_spec["doc_crud"][
        MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 0

    doc_load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 0
    doc_load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 0
    doc_load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 0

    ops_to_perform = ["create", "update", "read", "replace", "delete"]
    if self.subdoc_test:
        ops_to_perform = ["insert", "upsert", "remove"]

    for op_type in ops_to_perform:
        self.log.info("Performing '%s' with timeout=%s"
                      % (op_type, self.sdk_timeout))
        curr_spec = deepcopy(doc_load_spec)
        if op_type == "create":
            curr_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] \
                = 5
        elif op_type == "update":
            curr_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] \
                = 5
        elif op_type == "replace":
            # Fix: 'replace' is in ops_to_perform but previously had no
            # branch, so its iteration ran an empty load spec and
            # exercised nothing.
            curr_spec["doc_crud"][
                MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] \
                = 5
        elif op_type == "delete":
            curr_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] \
                = 5
        elif op_type == "read":
            curr_spec["doc_crud"][
                MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 5
            # Reads may time out while the error is active; they are
            # retried rather than treated as failures.
            curr_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
                SDKException.TimeoutException]
        elif op_type == "insert":
            curr_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 5
        elif op_type == "upsert":
            curr_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 5
        elif op_type == "remove":
            curr_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 5

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                curr_spec,
                mutation_num=1,
                async_load=True,
                validate_task=False)

        # Perform specified action while the load is in flight
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(10, "Wait before reverting the error condition")

        # Revert the specified error scenario so the pending ops can
        # complete within their SDK timeout
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.fail("Doc_loading for '%s' failed" % op_type)

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            curr_stat = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == curr_stat:
                self.log_failure("vbucket_seqno not updated. %s == %s"
                                 % (vb_info["init"][node.ip],
                                    curr_stat))

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()

    self.validate_test_failure()