def setUp(self):
    """
    Base setup for the sub-doc timeout tests.

    Calls the parent class setUp and then runs a sub-doc INSERT load
    using a generator built with the bounds (0, num_items // 2), waiting
    for the load task to finish before returning.
    """
    super(SubDocTimeouts, self).setUp()

    # Floor division keeps the generator bound an integer even when '/'
    # performs true division (identical result for ints on Python 2)
    half_of_num_items = self.num_items // 2

    # Loading SubDocs to loaded documents
    self.log.info("Creating doc_generator..")
    # Load basic docs into bucket
    doc_create = sub_doc_generator(
        self.key, 0, half_of_num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster.vbuckets)
    self.log.info("Loading {0} Sub-docs into the bucket: {1}".format(
        half_of_num_items, self.bucket))
    task = self.task.async_load_gen_sub_docs(
        self.cluster, self.bucket, doc_create,
        DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
        path_create=True,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(task)
    self.log.info("==========Finished SubDocFailures base setup========")
def test_sync_write_in_progress(self):
    """
    Test to simulate sync_write_in_progress error and validate the behavior
    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select nodes to simulate the error which will affect the durability
    2. Enable the specified error_scenario on the selected nodes
    3. Perform individual CRUDs and verify sync_write_in_progress errors
    4. Validate the end results

    self.doc_ops[0] selects the operation for the first (blocked) loader
    and self.doc_ops[1] the operation for the second loader, whose
    failures are validated.
    """
    kv_ops = DocLoading.Bucket.DocOps
    sd_ops = DocLoading.Bucket.SubDocOps
    kv_crud_ops = [kv_ops.CREATE, kv_ops.UPDATE, kv_ops.DELETE]
    sd_crud_ops = [sd_ops.INSERT, sd_ops.UPSERT, sd_ops.REMOVE]
    sd_edit_ops = [sd_ops.UPSERT, sd_ops.REMOVE]

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    replica_vbs = dict()

    # Variable to hold one of the doc_generator objects
    gen_loader = [None, None]
    doc_loader_task_1 = None
    doc_loader_task_2 = None

    # Override the crud_batch_size
    self.crud_batch_size = 5
    expected_failed_doc_num = self.crud_batch_size

    # Select nodes to affect and open required shell_connections
    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' vb_num which are of type=replica
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    # Keep only the vbuckets that are replica on every target node
    target_vbuckets = replica_vbs[target_nodes[0].ip]
    for node in target_nodes[1:]:
        target_vbuckets = list(
            set(target_vbuckets).intersection(set(replica_vbs[node.ip])))

    # Initialize doc_generators to use for testing
    self.log.info("Creating doc_generators")
    gen_create = doc_generator(
        self.key, self.num_items, self.crud_batch_size,
        key_size=self.key_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    gen_update_delete = doc_generator(
        self.key, 0, self.crud_batch_size,
        key_size=self.key_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets,
        mutate=1)
    gen_subdoc = sub_doc_generator(
        self.key, 0, self.crud_batch_size,
        key_size=self.key_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    self.log.info("Done creating doc_generators")

    initial_num_items = self.num_items

    # Start CRUD operation based on the given 'doc_op' type
    # ('==' is used on purpose: the op constants are compared as whole
    # values, not as substrings)
    if self.doc_ops[0] == kv_ops.CREATE:
        self.num_items += self.crud_batch_size
        gen_loader[0] = gen_create
    elif self.doc_ops[0] == kv_ops.UPDATE:
        gen_loader[0] = gen_update_delete
    elif self.doc_ops[0] == kv_ops.DELETE:
        gen_loader[0] = gen_update_delete
        self.num_items -= self.crud_batch_size
    elif self.doc_ops[0] in sd_crud_ops:
        gen_loader[0] = gen_subdoc

    if self.doc_ops[1] == kv_ops.CREATE:
        gen_loader[1] = gen_create
    elif self.doc_ops[1] in [kv_ops.UPDATE, kv_ops.DELETE]:
        gen_loader[1] = gen_update_delete
    elif self.doc_ops[1] in sd_crud_ops:
        if self.doc_ops[1] == sd_ops.INSERT \
                and self.doc_ops[0] == kv_ops.CREATE:
            # Sub-doc insert runs against the docs targeted by the
            # parallel create, so build the generator from the doc
            # count recorded before the create adjusted num_items
            gen_subdoc = sub_doc_generator(
                self.key, initial_num_items, self.crud_batch_size,
                key_size=self.key_size,
                vbuckets=self.cluster.vbuckets,
                target_vbucket=target_vbuckets)
        gen_loader[1] = gen_subdoc

    # Load task for further upsert / remove operations
    if self.doc_ops[0] in sd_edit_ops or self.doc_ops[1] in sd_edit_ops:
        subdoc_load_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_subdoc,
            sd_ops.INSERT,
            path_create=True,
            batch_size=self.crud_batch_size, process_concurrency=8,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(subdoc_load_task)

    tem_durability = self.durability_level
    if self.with_non_sync_writes:
        tem_durability = "NONE"

    # Initialize tasks and store the task objects
    if self.doc_ops[0] in kv_crud_ops:
        doc_loader_task_1 = self.task.async_load_gen_docs(
            self.cluster, self.bucket, gen_loader[0], self.doc_ops[0], 0,
            batch_size=1, process_concurrency=self.crud_batch_size,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            start_task=False)
    elif self.doc_ops[0] in sd_crud_ops:
        doc_loader_task_1 = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_loader[0], self.doc_ops[0], 0,
            path_create=True,
            batch_size=1, process_concurrency=self.crud_batch_size,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            start_task=False)

    # This will support both sync-write and non-sync-writes
    if self.doc_ops[1] in kv_crud_ops:
        doc_loader_task_2 = self.task.async_load_gen_docs(
            self.cluster, self.bucket, gen_loader[1], self.doc_ops[1], 0,
            batch_size=self.crud_batch_size, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=tem_durability, timeout_secs=5,
            task_identifier="parallel_task2",
            print_ops_rate=False,
            start_task=False)
    elif self.doc_ops[1] in sd_crud_ops:
        doc_loader_task_2 = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_loader[1], self.doc_ops[1], 0,
            path_create=True,
            batch_size=self.crud_batch_size, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=tem_durability, timeout_secs=5,
            task_identifier="parallel_task2",
            print_ops_rate=False,
            start_task=False)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
    self.sleep(5, "Wait for error simulation to take effect")

    # Start the loader_task_1
    self.task_manager.add_new_task(doc_loader_task_1)
    self.sleep(10, "Wait for task_1 CRUDs to reach server")

    # Start the loader_task_2
    self.task_manager.add_new_task(doc_loader_task_2)

    # This task should be done will all sync_write_in_progress errors
    self.task.jython_task_manager.get_task_result(doc_loader_task_2)

    # Revert the introduced error condition
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Wait for doc_loader_task_1 to complete
    self.task.jython_task_manager.get_task_result(doc_loader_task_1)

    # Shell connections are not needed past this point, release them
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    # Validation to verify the sync_in_write_errors in doc_loader_task_2
    failed_docs = doc_loader_task_2.fail
    if len(failed_docs.keys()) != expected_failed_doc_num:
        self.log_failure(
            "Exception not seen for few docs: {0}".format(failed_docs))

    expected_exception = SDKException.AmbiguousTimeoutException
    retry_reason = SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
    if self.doc_ops[0] == kv_ops.CREATE:
        expected_exception = SDKException.DocumentNotFoundException
        retry_reason = None
    valid_exception = self.durability_helper.validate_durability_exception(
        failed_docs,
        expected_exception,
        retry_reason=retry_reason)

    if not valid_exception:
        self.log_failure("Got invalid exception")

    # Validate docs for update success or not
    if self.doc_ops[0] == kv_ops.UPDATE:
        read_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, gen_loader[0],
            kv_ops.READ,
            batch_size=self.crud_batch_size, process_concurrency=1,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(read_task)
        for key, doc_info in read_task.success.items():
            if doc_info["cas"] != 0 \
                    and json.loads(str(doc_info["value"]))["mutated"] != 1:
                self.log_failure("Update failed for key %s: %s"
                                 % (key, doc_info))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
    self.validate_test_failure()
def test_timeout_with_successful_crud(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side.

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operation succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    doc_gen = dict()
    vb_info = dict()
    vb_info["init"] = dict()
    vb_info["afterCrud"] = dict()

    # Floor division keeps the generator bounds integral even when '/'
    # performs true division (identical result for ints on Python 2)
    half_of_num_items = self.num_items // 2
    quarter_of_num_items = self.num_items // 4

    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    doc_gen["insert"] = sub_doc_generator(
        self.key, half_of_num_items, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size)
    doc_gen["read"] = sub_doc_generator(
        self.key, quarter_of_num_items, half_of_num_items,
        key_size=self.key_size)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, quarter_of_num_items, half_of_num_items,
        key_size=self.key_size,
        template_index=2)
    doc_gen["remove"] = sub_doc_generator_for_edit(
        self.key, 0, quarter_of_num_items,
        key_size=self.key_size,
        template_index=2)

    for op_type in doc_gen.keys():
        self.log.info("Performing '%s' with timeout=%s"
                      % (op_type, self.sdk_timeout))
        doc_load_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type,
            self.maxttl,
            path_create=True,
            batch_size=500, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        self.sleep(5, "Wait before reverting the error condition")

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        self.task_manager.get_task_result(doc_load_task)

        failed_keys = doc_load_task.fail.keys()
        if len(failed_keys) != 0:
            if op_type == "read":
                # Read failures are only reported, not counted as failure
                self.log.warning("Read failed for %d keys: %s"
                                 % (len(failed_keys), failed_keys))
            else:
                self.log_failure("Failures during %s operation: %s"
                                 % (op_type, doc_load_task.fail))

        # Fetch latest stats and validate the values are updated
        # (reads are skipped for this check)
        if op_type != "read":
            for node in target_nodes:
                vb_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                if vb_info["init"][node.ip] \
                        == vb_info["afterCrud"][node.ip]:
                    self.log_failure(
                        "vbucket_seqno not updated. {0} == {1}".format(
                            vb_info["init"][node.ip],
                            vb_info["afterCrud"][node.ip]))

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    # Read mutation field from all docs for validation
    gen_read = sub_doc_generator_for_edit(self.key, 0, self.num_items,
                                          key_size=self.key_size)
    gen_read.template = '{{ "mutated": "" }}'
    reader_task = self.task.async_load_gen_sub_docs(
        self.cluster, self.bucket, gen_read, "read",
        batch_size=50, process_concurrency=8,
        timeout_secs=self.sdk_timeout)
    self.task_manager.get_task_result(reader_task)

    len_failed_keys = len(reader_task.fail.keys())
    if len_failed_keys != 0:
        self.log_failure("Failures in read_task (%d): %s"
                         % (len_failed_keys, reader_task.fail.keys()))

    for doc_key, crud_result in reader_task.success.items():
        # Keys with index >= half_of_num_items are expected to read 1,
        # all other keys are expected to read 2
        expected_val = 2
        if int(doc_key.split('-')[1]) >= half_of_num_items:
            expected_val = 1
        if crud_result["value"][0] != expected_val:
            self.log_failure("Value mismatch for %s: %s"
                             % (doc_key, crud_result))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
    self.validate_test_failure()
def test_crud_failures(self):
    """
    Test to configure the cluster in such a way durability will always fail

    1. Try creating the docs with durability set
    2. Verify create failed with durability_not_possible exception
    3. Create docs using async_writes
    4. Perform update and delete ops with durability
    5. Make sure these ops also fail with durability_not_possible exception
    """

    def validate_doc_mutated_value(expected_val):
        """
        Read back the docs using subdoc_reader_gen and log a test
        failure for every successfully read key whose first returned
        value differs from expected_val (both compared as int).
        """
        reader_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, subdoc_reader_gen, "read",
            batch_size=10, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(reader_task)
        for doc_id, read_result in reader_task.success.items():
            if int(read_result["value"][0]) != int(expected_val):
                self.log_failure(
                    "Key %s - mutated value is %s != %s"
                    % (doc_id, read_result["value"], expected_val))

    tasks = list()
    vb_info = dict()
    cbstat_obj = dict()
    vb_info["init"] = dict()
    vb_info["failure_stat"] = dict()
    vb_info["create_stat"] = dict()
    nodes_in_cluster = self.cluster_util.get_kv_nodes(self.cluster)
    gen_load = doc_generator(self.key, 0, self.num_items)
    gen_subdoc_load = sub_doc_generator(self.key, 0, self.num_items,
                                        key_size=self.key_size)
    subdoc_reader_gen = sub_doc_generator(self.key, 0, self.num_items,
                                          key_size=self.key_size)
    subdoc_reader_gen.template = '{{ "mutated": "" }}'
    err_msg = "Doc mutation succeeded with, " \
              "cluster size: {0}, replica: {1}" \
              .format(len(self.cluster.nodes_in_cluster),
                      self.num_replicas)
    d_impossible_exception = SDKException.DurabilityImpossibleException

    # Load basic documents without durability for validating SubDocs
    create_task = self.task.async_load_gen_docs(
        self.cluster, self.bucket, gen_load, "create",
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(create_task)

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

    # Only cbstats objects are needed here. No shell connection is
    # opened since nothing in this test uses one.
    for node in nodes_in_cluster:
        cbstat_obj[node.ip] = Cbstats(node)
        # Fetch vbucket seq_no stats from vb_seqno command for verification
        vb_info["init"].update(cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name))

    # MB-34064 - Try same CREATE twice to validate doc cleanup in server
    for _ in range(2):
        # Perform durable SET operation
        d_create_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_subdoc_load,
            DocLoading.Bucket.SubDocOps.INSERT,
            path_create=True,
            batch_size=10, process_concurrency=8,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(d_create_task)

        # Fetch vbucket seq_no status from cbstats after CREATE task
        for node in nodes_in_cluster:
            vb_info["failure_stat"].update(
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name))

        # Every doc is expected to fail the durable insert
        self.assertEqual(len(d_create_task.fail.keys()), self.num_items,
                         msg=err_msg)
        if vb_info["init"] != vb_info["failure_stat"]:
            self.log_failure("Failover stats mismatch. {0} != {1}".format(
                vb_info["init"], vb_info["failure_stat"]))

        validation_passed = \
            self.durability_helper.validate_durability_exception(
                d_create_task.fail,
                d_impossible_exception)
        if not validation_passed:
            self.log_failure("Unexpected exception type")

        validate_doc_mutated_value(0)

    # Perform aync_write to create the documents
    async_create_task = self.task.async_load_gen_sub_docs(
        self.cluster, self.bucket, gen_subdoc_load,
        DocLoading.Bucket.SubDocOps.INSERT,
        path_create=True,
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(async_create_task)

    if len(async_create_task.fail.keys()) != 0:
        self.log_failure("Few failures during async_create(%d): %s"
                         % (len(async_create_task.fail.keys()),
                            async_create_task.fail.keys()))
    validate_doc_mutated_value(1)

    # Verify doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

    # Fetch vbucket seq_no status from vb_seqno command after async CREATEs
    for node in nodes_in_cluster:
        vb_info["create_stat"].update(cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name))

    # Start durable UPDATE operation
    tasks.append(
        self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_subdoc_load,
            DocLoading.Bucket.SubDocOps.UPSERT,
            path_create=True,
            batch_size=10, process_concurrency=4,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))
    # Start durable DELETE operation
    tasks.append(
        self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, gen_subdoc_load,
            DocLoading.Bucket.SubDocOps.REMOVE,
            batch_size=10, process_concurrency=4,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))

    # Wait for all tasks to complete and validate the exception
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)

        if len(task.fail.keys()) != self.num_items:
            self.log_failure(
                "Few keys have not received exceptions: {0}".format(
                    task.fail.keys()))
        validation_passed = \
            self.durability_helper.validate_durability_exception(
                task.fail,
                d_impossible_exception)
        if not validation_passed:
            self.log_failure("Unexpected exception type")

    # Verify doc count is unchanged due to durability failures
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
    validate_doc_mutated_value(1)

    # Reset failure_stat dictionary for reuse
    vb_info["failure_stat"] = dict()

    # Fetch vbucket seq_no status from vb_seqno after UPDATE/DELETE task
    for node in nodes_in_cluster:
        vb_info["failure_stat"].update(cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name))

    if vb_info["create_stat"] != vb_info["failure_stat"]:
        self.log_failure("Failover stats mismatch. {0} != {1}".format(
            vb_info["failure_stat"], vb_info["create_stat"]))
    self.validate_test_failure()
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    # NOTE(review): another method with this exact name is defined later
    # in the file. If both belong to the same class, the later definition
    # replaces this one and this test never runs - confirm and rename.

    # Local method to validate vb_seqno
    def validate_vb_seqno_stats(target_node):
        """
        Refresh the 'post_timeout' seqno stats of target_node and compare
        them with the 'init' stats, vbucket by vbucket.

        :param target_node: Node whose vbucket seqno stats are validated
        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        node_ip = target_node.ip
        vb_info["post_timeout"][node_ip] = \
            cbstat_obj[node_ip].vbucket_seqno(self.bucket.name)
        init_stats = vb_info["init"][node_ip]
        post_stats = vb_info["post_timeout"][node_ip]
        for vb_num in range(self.cluster.vbuckets):
            vb_id = str(vb_num)
            # vBuckets without an initial stat on this node are ignored
            if vb_id not in init_stats.keys():
                continue

            if vb_id not in affected_vbs:
                if init_stats[vb_id] != post_stats[vb_id]:
                    self.log_failure(
                        "Unaffected vb-%s stat updated: %s != %s"
                        % (vb_id, init_stats[vb_id], post_stats[vb_id]))
            elif vb_num in target_nodes_vbuckets[Bucket.vBucket.ACTIVE]:
                if init_stats[vb_id] != post_stats[vb_id]:
                    self.log.warning(
                        err_msg
                        % (node_ip, Bucket.vBucket.ACTIVE, vb_id,
                           init_stats[vb_id], post_stats[vb_id]))
            elif vb_num in target_nodes_vbuckets[Bucket.vBucket.REPLICA]:
                if init_stats[vb_id] == post_stats[vb_id]:
                    # Unchanged stat on a replica vb requests a retry
                    retry_validation = True
                    self.log.warning(
                        err_msg
                        % (node_ip, Bucket.vBucket.REPLICA, vb_id,
                           init_stats[vb_id], post_stats[vb_id]))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    # Only used for membership checks, hence a set
    affected_vbs = set()

    target_nodes_vbuckets[Bucket.vBucket.ACTIVE] = list()
    target_nodes_vbuckets[Bucket.vBucket.REPLICA] = list()
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5

    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        target_nodes_vbuckets[Bucket.vBucket.ACTIVE] += \
            cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type=Bucket.vBucket.ACTIVE)
        target_nodes_vbuckets[Bucket.vBucket.REPLICA] += \
            cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type=Bucket.vBucket.REPLICA)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    curr_time = int(time.time())
    expected_timeout = curr_time + self.sdk_timeout

    # Active vbuckets are targeted unless this is a multi-node cluster
    # running with PERSIST_TO_MAJORITY durability
    target_vbs = target_nodes_vbuckets[Bucket.vBucket.ACTIVE]
    if self.nodes_init != 1 \
            and self.durability_level \
            == Bucket.DurabilityLevel.PERSIST_TO_MAJORITY:
        target_vbs = target_nodes_vbuckets[Bucket.vBucket.REPLICA]

    # Floor division keeps the indexes integral even when '/' performs
    # true division (identical result for ints on Python 2)
    half_of_num_items = self.num_items // 2
    quarter_of_num_items = self.num_items // 4

    # Create required doc_generators
    doc_gen["insert"] = sub_doc_generator(
        self.key, half_of_num_items, self.crud_batch_size,
        target_vbucket=target_vbs,
        key_size=self.key_size)
    doc_gen["remove"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size,
        key_size=self.key_size,
        template_index=2,
        target_vbucket=target_vbs)
    doc_gen["read"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size,
        key_size=self.key_size,
        template_index=0,
        target_vbucket=target_vbs)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, quarter_of_num_items, self.crud_batch_size,
        key_size=self.key_size,
        template_index=1,
        target_vbucket=target_vbs)

    # Tasks are created with start_task=False and added to the task
    # manager only after the error is induced
    for op_type in doc_gen.keys():
        tasks[op_type] = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
            path_create=True,
            batch_size=1, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            start_task=False)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    for op_type in doc_gen.keys():
        self.task_manager.add_new_task(tasks[op_type])

    # Wait for document_loader tasks to complete
    for op_type in doc_gen.keys():
        self.task.jython_task_manager.get_task_result(tasks[op_type])

        # Validate task failures
        if op_type == DocLoading.Bucket.DocOps.READ:
            # Validation for read task
            if len(tasks[op_type].fail.keys()) != 0:
                self.log_failure("Read failed for few docs: %s"
                                 % tasks[op_type].fail.keys())
        else:
            # Validation of CRUDs - Update / Create / Delete
            for doc_id, crud_result in tasks[op_type].fail.items():
                if SDKException.DurabilityAmbiguousException \
                        not in str(crud_result["error"]):
                    # vb number is needed only for the failure message
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster.vbuckets)
                    self.log_failure(
                        "Invalid exception for doc %s, vb %s: %s"
                        % (doc_id, vb_num, crud_result))

    # Revert the specified error scenario
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Check whether the timeout triggered properly
    if int(time.time()) < expected_timeout:
        self.log_failure("Timed-out before expected time")

    # Collect the vbuckets of all keys targeted by the mutation ops
    for op_type in doc_gen.keys():
        if op_type == DocLoading.Bucket.DocOps.READ:
            continue
        while doc_gen[op_type].has_next():
            doc_id, _ = doc_gen[op_type].next()
            affected_vbs.add(
                str(self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.cluster.vbuckets)))

    err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
    # Fetch latest stats and validate the seq_nos are not updated
    for node in target_nodes:
        max_retry = 3
        for attempt in range(1, max_retry + 1):
            self.log.info("Trying to validate vbseq_no stats: %d"
                          % attempt)
            if not validate_vb_seqno_stats(node):
                break
            self.sleep(5, "Sleep for vbseq_no stats to update")
        else:
            # This will be exited only if `break` condition is not met
            self.log_failure("validate_vb_seqno_stats verification failed")

    self.validate_test_failure()

    # If replicas+1 == total nodes, verify no mutation should have
    # succeeded with durability
    if self.nodes_init == self.num_replicas + 1:
        read_gen = doc_generator(self.key, 0, self.num_items)
        read_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, read_gen,
            DocLoading.Bucket.DocOps.READ, 0,
            batch_size=500, process_concurrency=1,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(read_task)

        failed_keys = TableView(self.log.error)
        failed_keys.set_headers(["Key", "Error"])
        for doc_key, doc_info in read_task.success.items():
            key_index = int(doc_key.split("-")[1])
            # Keys below half_of_num_items are expected to have
            # mutated=1, the remaining keys mutated=0
            expected_mutated_val = 0
            if key_index < half_of_num_items:
                expected_mutated_val = 1
            mutated = json.loads(str(doc_info["value"]))["mutated"]
            if mutated != expected_mutated_val:
                failed_keys.add_row([doc_key, doc_info])

        failed_keys.display("Affected mutations:")
        self.log.error(read_task.fail)

    # Doc error validation
    for op_type in doc_gen.keys():
        task = tasks[op_type]

        retry_task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
            path_create=True,
            batch_size=1, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(retry_task)

        # Only keys that failed in the retry but not in the initial
        # task are reported
        retry_failures = set(retry_task.fail.keys())
        initial_failures = set(task.fail.keys())
        if retry_failures.difference(initial_failures):
            self.log_failure("Docs failed during retry task for %s: %s"
                             % (op_type, retry_task.fail))

    # Verify doc count after expected CRUD failure
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

    # Fetch latest stats and validate the values are updated
    for node in target_nodes:
        vb_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
            self.log_failure("vBucket seq_no stats not updated")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'

    The shell connections opened by the test and the SDK client taken
    from the client pool are released in `finally` blocks, so they are
    given back even when one of the validation steps raises.
    """

    # Local methods to validate vb_seqno
    def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
        """
        Compare the seqno stats of one vbucket between two snapshots.

        :param stat_1: Reference vbucket_seqno snapshot (dict keyed by vb)
        :param stat_2: Snapshot to compare against stat_1
        :param vb: vbucket key to look up in both snapshots
        :param comparison: "!=" flags keys whose values differ,
                           any other value flags keys whose values match
        :return: False if at least one checked key got flagged, else True
        """
        result = True
        if vb not in stat_1:
            # No entry for this vbucket in the reference snapshot
            return result

        # UUID is checked once per vbucket (not once per seqno key) so a
        # mismatch is reported a single time
        if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
            self.log_failure("Mismatch in vb-%s UUID. %s != %s"
                             % (vb, stat_1[vb]["uuid"],
                                stat_2[vb]["uuid"]))

        for key in ["high_seqno", "high_completed_seqno"]:
            if comparison == "!=":
                if stat_1[vb][key] != stat_2[vb][key]:
                    result = False
                    self.log.warning(
                        "Mismatch in vb-%s stat %s. %s != %s"
                        % (vb, key, stat_1[vb][key], stat_2[vb][key]))
            elif stat_1[vb][key] == stat_2[vb][key]:
                result = False
                self.log.warning("Stat not updated for vb-%s stat %s. "
                                 "%s == %s"
                                 % (vb, key,
                                    stat_1[vb][key], stat_2[vb][key]))
        return result

    def validate_vb_seqno_stats(target_node):
        """
        Refresh the "post_timeout" seqno snapshot of target_node and
        compare it against the "init" snapshot for every vbucket.

        :param target_node: Node whose cbstats snapshot is validated
        :return retry_validation: Boolean denoting to retry validation
        """
        node_ip = target_node.ip
        retry_validation = False
        vb_info["post_timeout"][node_ip] = \
            cbstat_obj[node_ip].vbucket_seqno(self.bucket.name)
        init_stat = vb_info["init"][node_ip]
        post_stat = vb_info["post_timeout"][node_ip]
        for vb_index in range(self.cluster_util.vbuckets):
            vb_key = str(vb_index)
            if vb_key not in affected_vbs:
                if not compare_vb_stat(init_stat, post_stat, vb_key):
                    self.log_failure("Unaffected vb-%s stat" % vb_key)
            elif vb_index in target_nodes_vbuckets["active"]:
                if not compare_vb_stat(init_stat, post_stat, vb_key):
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node_ip, "active", vb_key))
            elif vb_index in target_nodes_vbuckets["replica"]:
                if not compare_vb_stat(init_stat, post_stat, vb_key,
                                       comparison="=="):
                    # Only this branch requests another validation round
                    retry_validation = True
                    self.log.warning("%s - mismatch in %s vb-%s seq_no"
                                     % (node_ip, "replica", vb_key))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    affected_vbs = list()

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5
    self.key = "test_collections"
    self.sdk_timeout = 3

    # Select target vbucket type to load_docs
    target_vb_type = "replica"
    if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
            and self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        target_vb_type = "active"

    # Create required scope/collection for successful CRUD operation
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
    self.collection_name = self.bucket_util.get_random_name()
    self.log.info("Creating scope::collection %s::%s"
                  % (self.scope_name, self.collection_name))
    self.create_scope_collection()

    # Load docs into created collection
    self.log.info("Loading data into created collection")
    load_gen = doc_generator(self.key, 0, self.num_items)
    task = self.task.async_load_gen_docs(
        self.cluster, self.bucket, load_gen, "create", 0,
        scope=self.scope_name,
        collection=self.collection_name,
        sdk_client_pool=self.sdk_client_pool,
        batch_size=200, process_concurrency=8,
        timeout_secs=60)
    self.task_manager.get_task_result(task)
    if self.subdoc_test:
        # Floor division keeps the index an integer
        load_gen = sub_doc_generator(self.key, 0, self.num_items // 2)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, load_gen,
            Bucket_Op.SubDocOps.INSERT,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            path_create=True,
            batch_size=100,
            process_concurrency=8,
            durability=self.durability_level,
            scope=self.scope_name, collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(task)

    self.bucket.scopes[self.scope_name].collections[
        self.collection_name].num_items = self.num_items

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            target_nodes_vbuckets["active"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="active")
            target_nodes_vbuckets["replica"] += \
                cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                 vbucket_type="replica")
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        # NOTE(review): the reference time is taken before the error
        # simulation and the wait that follows it, so the
        # "Timed-out before expected time" check below may pass
        # regardless of the SDK timeout - confirm this is intended
        curr_time = int(time.time())
        expected_timeout = curr_time + self.sdk_timeout

        # Pick vbuckets that are exclusively of the targeted type
        # on the affected nodes
        if target_vb_type == "active":
            target_vbs = list(
                set(target_nodes_vbuckets[target_vb_type]).difference(
                    set(target_nodes_vbuckets["replica"])))
        else:
            target_vbs = list(
                set(target_nodes_vbuckets[target_vb_type]).difference(
                    set(target_nodes_vbuckets["active"])))

        # Create required doc_generators
        doc_gen["create"] = doc_generator(self.key, self.num_items,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["delete"] = doc_generator(self.key, 0,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)
        doc_gen["read"] = doc_generator(self.key,
                                        self.num_items // 3,
                                        self.crud_batch_size,
                                        target_vbucket=target_vbs)
        doc_gen["update"] = doc_generator(self.key,
                                          self.num_items // 2,
                                          self.crud_batch_size,
                                          target_vbucket=target_vbs)

        # Create required subdoc generators
        doc_gen["insert"] = sub_doc_generator(self.key,
                                              self.num_items // 2,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)
        doc_gen["upsert"] = sub_doc_generator_for_edit(
            self.key, 0, self.crud_batch_size,
            template_index=1,
            target_vbucket=target_vbs)
        doc_gen["remove"] = sub_doc_generator(self.key, 0,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error_simulation to take effect")

        ops_to_perform = [Bucket_Op.DocOps.CREATE,
                          Bucket_Op.DocOps.UPDATE,
                          Bucket_Op.DocOps.READ,
                          Bucket_Op.DocOps.DELETE]
        if self.subdoc_test:
            ops_to_perform = [Bucket_Op.SubDocOps.INSERT,
                              Bucket_Op.SubDocOps.UPSERT,
                              Bucket_Op.SubDocOps.REMOVE]

        for op_type in ops_to_perform:
            self.log.info("Starting doc op %s" % op_type)
            if op_type in Bucket_Op.DOC_OPS:
                tasks[op_type] = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, doc_gen[op_type],
                    op_type, 0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    batch_size=1, process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    suppress_error_table=True,
                    print_ops_rate=False,
                    skip_read_on_error=True)
            else:
                tasks[op_type] = self.task.async_load_gen_sub_docs(
                    self.cluster, self.bucket, doc_gen[op_type],
                    op_type, 0,
                    scope=self.scope_name,
                    collection=self.collection_name,
                    sdk_client_pool=self.sdk_client_pool,
                    path_create=True,
                    batch_size=1, process_concurrency=8,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    print_ops_rate=False)

            self.task.jython_task_manager.get_task_result(tasks[op_type])

            # Validate task failures
            if op_type == Bucket_Op.DocOps.READ:
                # Validation for read task
                if len(tasks[op_type].fail.keys()) != 0:
                    self.log_failure("Read failed for few docs: %s"
                                     % tasks[op_type].fail.keys())
            else:
                # Validation of CRUDs - Update / Create / Delete
                for doc_id, crud_result in tasks[op_type].fail.items():
                    vb_num = self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster_util.vbuckets)
                    if SDKException.DurabilityAmbiguousException \
                            not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

        # Revert the specified error scenario
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Check whether the timeout triggered properly
        if int(time.time()) < expected_timeout:
            self.log_failure("Timed-out before expected time")

        # Collect the vbuckets touched by the mutation generators
        for op_type in ops_to_perform:
            if op_type == Bucket_Op.DocOps.READ:
                continue
            while doc_gen[op_type].has_next():
                doc_id, _ = doc_gen[op_type].next()
                affected_vbs.append(
                    str(self.bucket_util.get_vbucket_num_for_key(
                        doc_id, self.cluster_util.vbuckets)))

        affected_vbs = list(set(affected_vbs))

        # Fetch latest stats and validate the seq_nos are not updated
        for node in target_nodes:
            retry_count = 0
            max_retry = 3
            while retry_count < max_retry:
                self.log.info("Trying to validate vbseq_no stats: %d"
                              % (retry_count + 1))
                retry_count += 1
                retry_required = validate_vb_seqno_stats(node)
                if not retry_required:
                    break
                self.sleep(5, "Sleep for vbseq_no stats to update")
            else:
                # This will be exited only if `break` condition is not met
                self.log_failure(
                    "validate_vb_seqno_stats verification failed")

        self.validate_test_failure()

        # Get SDK Client from client_pool
        sdk_client = self.sdk_client_pool.get_client_for_bucket(
            self.bucket,
            self.scope_name,
            self.collection_name)
        try:
            # Doc error validation
            for op_type in ops_to_perform:
                task = tasks[op_type]
                expected_failures = \
                    doc_gen[op_type].end - doc_gen[op_type].start

                if self.nodes_init == 1 \
                        and op_type != Bucket_Op.DocOps.READ \
                        and len(task.fail.keys()) != expected_failures:
                    self.log_failure(
                        "Failed keys %d are less than expected %d"
                        % (len(task.fail.keys()), expected_failures))

                # Create table objects for display
                table_view = TableView(self.log.error)
                ambiguous_table_view = TableView(self.log.info)
                table_view.set_headers(["Key", "vBucket", "Exception"])
                ambiguous_table_view.set_headers(["Key", "vBucket"])

                # Iterate failed keys for validation
                for doc_key, doc_info in task.fail.items():
                    vb_for_key = \
                        self.bucket_util.get_vbucket_num_for_key(
                            doc_key, self.cluster_util.vbuckets)

                    # A failed key is listed in exactly one table
                    if SDKException.DurabilityAmbiguousException \
                            not in str(doc_info["error"]):
                        table_view.add_row([doc_key, vb_for_key,
                                            doc_info["error"]])
                    else:
                        ambiguous_table_view.add_row([doc_key,
                                                      str(vb_for_key)])

                    if op_type not in Bucket_Op.SUB_DOC_OPS:
                        retry_success = self.durability_helper \
                            .retry_for_ambiguous_exception(
                                sdk_client, op_type, doc_key, doc_info)
                        if not retry_success:
                            self.log_failure("%s failed in retry for %s"
                                             % (op_type, doc_key))

                # Display the tables (if any errors)
                table_view.display("Unexpected exception during %s"
                                   % op_type)
                ambiguous_table_view.display(
                    "D_Ambiguous exception during %s" % op_type)
        finally:
            # Release the acquired client
            self.sdk_client_pool.release_client(sdk_client)

        # Verify doc count after expected CRUD failure
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest stats and validate the values are updated
        for node in target_nodes:
            vb_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                self.log_failure("vBucket seq_no stats not updated")
    finally:
        # Disconnect every shell connection that was opened
        for shell in shell_conn.values():
            shell.disconnect()

    self.validate_test_failure()
def test_with_persistence_issues(self):
    """
    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations met the durability condition

    The test returns early (without running) when the durability level
    is one of the persistence based levels.
    Shell connections are closed in a `finally` block, after the last
    cbstats query that is issued through them.
    """

    if self.durability_level.upper() in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    def_bucket = self.bucket_util.buckets[0]
    # Floor division keeps the range boundaries integers
    insert_end_index = self.num_items // 3
    upsert_end_index = (self.num_items // 3) * 2

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = self.getTargetNodes()

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        for node in target_nodes:
            # Create shell_connections
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

        for node in target_nodes:
            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Load sub_docs for upsert/remove mutation to work
        sub_doc_gen = sub_doc_generator(
            self.key,
            start=insert_end_index,
            end=self.num_items,
            key_size=self.key_size,
            doc_size=self.sub_doc_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen,
            DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
            path_create=True,
            batch_size=20, process_concurrency=8,
            persist_to=self.persist_to, replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(task)

        # Perform CRUDs with induced error scenario is active
        tasks = list()
        gen_create = sub_doc_generator(
            self.key,
            0,
            insert_end_index,
            key_size=self.key_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        gen_update = sub_doc_generator_for_edit(
            self.key,
            insert_end_index,
            upsert_end_index,
            key_size=self.key_size,
            template_index=0,
            target_vbucket=self.target_vbucket)
        gen_delete = sub_doc_generator_for_edit(
            self.key,
            upsert_end_index,
            self.num_items,
            key_size=self.key_size,
            template_index=2,
            target_vbucket=self.target_vbucket)

        self.log.info(
            "Starting parallel doc_ops - insert/Read/upsert/remove")
        tasks.append(self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen_create,
            DocLoading.Bucket.SubDocOps.INSERT, 0,
            path_create=True,
            batch_size=10, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))
        tasks.append(self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen_update,
            "read", 0,
            batch_size=10, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))
        tasks.append(self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen_update,
            DocLoading.Bucket.SubDocOps.UPSERT, 0,
            path_create=True,
            batch_size=10, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))
        tasks.append(self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen_delete,
            DocLoading.Bucket.SubDocOps.REMOVE, 0,
            batch_size=10, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))

        # Wait for document_loader tasks to complete
        for task in tasks:
            self.task.jython_task_manager.get_task_result(task)
            # Verify there is not failed docs in the task
            if len(task.fail.keys()) != 0:
                self.log_failure(
                    "Some CRUD failed for {0}".format(task.fail))

        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

            # Failover validation
            # NOTE(review): the assertion passes when the failover stats
            # differ, while the failure message reads as if an update
            # is the error - confirm which one is intended
            val = failover_info["init"][node.ip] \
                != failover_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="Failover stats got updated")

            # Seq_no validation (High level)
            val = vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect only after the last cbstats call: the Cbstats
        # objects are built on top of these shell connections
        for shell in shell_conn.values():
            shell.disconnect()

    # Verify doc count
    self.log.info("Validating doc count")
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)
    self.validate_test_failure()
def test_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    Note: self.sdk_timeout values is considered as 'seconds'

    Fails immediately when num_replicas < 2. Shell connections are
    closed in a `finally` block so a failing assertion does not leave
    them open.
    """
    if self.num_replicas < 2:
        self.fail("Required: num_replicas > 1")

    # Override num_of_nodes affected to 1
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    target_vbuckets = range(0, self.cluster_util.vbuckets)
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    def_bucket = self.bucket_util.buckets[0]

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = self.getTargetNodes()

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        for node in target_nodes:
            # Create shell_connections
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            active_vbs = cbstat_obj[node.ip].vbucket_list(
                def_bucket.name, "active")
            active_vbs_in_target_nodes += active_vbs
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

        # Load sub_docs for upsert/remove mutation to work
        # (floor division keeps the indexes integers)
        sub_doc_gen = sub_doc_generator(self.key,
                                        start=0,
                                        end=self.num_items // 2,
                                        key_size=self.key_size,
                                        doc_size=self.sub_doc_size)
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen,
            DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
            path_create=True,
            batch_size=20, process_concurrency=8,
            persist_to=self.persist_to, replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(task)

        for node in target_nodes:
            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Remove active vbuckets from doc_loading to avoid errors
        target_vbuckets = list(
            set(target_vbuckets) - set(active_vbs_in_target_nodes))

        # Perform CRUDs with induced error scenario is active
        tasks = dict()
        gen = dict()
        gen["insert"] = sub_doc_generator(
            self.key,
            self.num_items // 2,
            self.crud_batch_size,
            key_size=self.key_size,
            target_vbucket=target_vbuckets)
        gen["read"] = sub_doc_generator_for_edit(
            self.key,
            self.num_items // 4,
            50,
            key_size=self.key_size,
            template_index=0,
            target_vbucket=target_vbuckets)
        gen["upsert"] = sub_doc_generator_for_edit(
            self.key,
            self.num_items // 4,
            50,
            key_size=self.key_size,
            template_index=0,
            target_vbucket=target_vbuckets)
        gen["remove"] = sub_doc_generator_for_edit(
            self.key,
            0,
            50,
            key_size=self.key_size,
            template_index=2,
            target_vbucket=target_vbuckets)

        self.log.info(
            "Starting parallel doc_ops - insert/Read/upsert/remove")
        tasks["insert"] = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen["insert"],
            DocLoading.Bucket.SubDocOps.INSERT, 0,
            path_create=True,
            batch_size=1, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            print_ops_rate=False,
            timeout_secs=self.sdk_timeout)
        tasks["read"] = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen["read"],
            "read", 0,
            batch_size=1, process_concurrency=1,
            print_ops_rate=False,
            timeout_secs=self.sdk_timeout)
        tasks["upsert"] = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen["upsert"],
            DocLoading.Bucket.SubDocOps.UPSERT, 0,
            path_create=True,
            batch_size=1, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            print_ops_rate=False,
            timeout_secs=self.sdk_timeout)
        tasks["remove"] = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen["remove"],
            DocLoading.Bucket.SubDocOps.REMOVE, 0,
            batch_size=1, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            print_ops_rate=False,
            timeout_secs=self.sdk_timeout)

        # Wait for document_loader tasks to complete
        for _, task in tasks.items():
            self.task_manager.get_task_result(task)

        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Read mutation field from all docs for validation
        gen_read = sub_doc_generator_for_edit(self.key, 0,
                                              self.num_items, 0,
                                              key_size=self.key_size)
        gen_read.template = '{{ "mutated": "" }}'
        reader_task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, gen_read, "read",
            key_size=self.key_size,
            batch_size=50, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(reader_task)

        # Validation for each CRUD task
        for op_type, task in tasks.items():
            if len(task.success.keys()) != len(gen[op_type].doc_keys):
                self.log_failure("Failure during %s operation" % op_type)
            elif len(task.fail.keys()) != 0:
                self.log_failure("Some CRUD failed during %s: %s"
                                 % (op_type, task.fail))

            for doc_key, crud_result in task.success.items():
                if crud_result["cas"] == 0:
                    self.log_failure("%s failed for %s: %s"
                                     % (op_type, doc_key, crud_result))
                # Expected value of the "mutated" field read back
                # for the key depends on the operation performed
                if op_type == DocLoading.Bucket.SubDocOps.INSERT:
                    if reader_task.success[doc_key]["value"][0] != 1:
                        self.log_failure(
                            "%s value mismatch for %s: %s"
                            % (op_type, doc_key, crud_result))
                elif op_type in [DocLoading.Bucket.SubDocOps.UPSERT,
                                 DocLoading.Bucket.SubDocOps.REMOVE]:
                    if reader_task.success[doc_key]["value"][0] != 2:
                        self.log_failure(
                            "%s value mismatch for %s: %s"
                            % (op_type, doc_key, crud_result))

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

            # Failover validation
            # NOTE(review): the assertion passes when the failover stats
            # are unchanged, while the failure message says
            # "not updated" - confirm which one is intended
            val = failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats not updated after error condition"
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level)
            val = vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect the shell connection (also on assertion failure)
        for shell in shell_conn.values():
            shell.disconnect()

    # Verify doc count
    self.log.info("Validating doc count")
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)
    self.validate_test_failure()
def test_non_overlapping_parallel_cruds(self):
    """
    Test to run non-overlapping durability cruds on single bucket
    and make sure all CRUD operation succeeds

    1. Run single task_1 with durability operation
    2. Create parallel task to run either SyncWrite / Non-SyncWrite
       operation based on the config param and run that over the docs
       such that it will not overlap with the other tasks
    3. Make sure all CRUDs succeeded without any unexpected exceptions

    The "doc_ops" param is a ';' separated list of sub-doc operations
    (insert / read / upsert / remove). At most the first four entries
    are scheduled: the first two with durability, the remaining ones
    with replicate_to / persist_to. An unknown operation name fails the
    test before any parallel loader is started.
    """

    doc_ops = self.input.param("doc_ops", "insert;upsert;remove;read")
    # Only the first four ops are ever scheduled
    doc_ops = doc_ops.split(";")[:4]
    doc_gen = dict()
    sub_doc_gen = dict()
    tasks = list()
    # Floor division keeps the range boundaries integers
    insert_end_index = self.num_items // 3
    upsert_end_index = (self.num_items // 3) * 2
    def_bucket = self.bucket_util.buckets[0]

    # Stat validation reference variables
    verification_dict = dict()
    verification_dict["ops_create"] = 0
    verification_dict["ops_update"] = 0
    verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    # Load sub_docs for upsert/remove to work
    curr_doc_gen = sub_doc_generator(self.key,
                                     insert_end_index,
                                     self.num_items,
                                     key_size=self.key_size)
    task = self.task.async_load_gen_sub_docs(
        self.cluster, def_bucket, curr_doc_gen,
        DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
        path_create=True,
        batch_size=10, process_concurrency=8,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(task)
    verification_dict["ops_update"] += \
        (curr_doc_gen.end - curr_doc_gen.start)
    if self.durability_level:
        verification_dict["sync_write_committed_count"] += \
            (curr_doc_gen.end - curr_doc_gen.start)

    # Create required doc_generators for CRUD ops
    doc_gen["create"] = doc_generator(self.key,
                                      self.num_items,
                                      self.num_items * 2,
                                      doc_size=self.doc_size,
                                      target_vbucket=self.target_vbucket,
                                      vbuckets=self.cluster_util.vbuckets)
    doc_gen["read"] = doc_generator(self.key,
                                    0,
                                    self.num_items)

    # Create sub-doc generators for CRUD test
    sub_doc_gen["insert"] = sub_doc_generator(self.key,
                                              start=0,
                                              end=insert_end_index,
                                              key_size=self.key_size,
                                              doc_size=self.sub_doc_size)
    sub_doc_gen["read"] = sub_doc_generator(self.key,
                                            start=insert_end_index,
                                            end=upsert_end_index,
                                            key_size=self.key_size,
                                            doc_size=self.sub_doc_size)
    sub_doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key,
        start=insert_end_index,
        end=upsert_end_index,
        template_index=0,
        key_size=self.key_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    sub_doc_gen["remove"] = sub_doc_generator_for_edit(
        self.key,
        start=upsert_end_index,
        end=self.num_items,
        template_index=2,
        key_size=self.key_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)

    # Reject unknown operations before any parallel task is started,
    # so a bad param does not leave running loaders behind
    for op_type in doc_ops:
        if op_type not in sub_doc_gen:
            self.fail("Invalid sub_doc operation '%s'" % op_type)

    # Start full document mutations before starting sub_doc ops
    tasks.append(self.task.async_load_gen_docs(
        self.cluster, def_bucket, doc_gen["create"],
        "create", 0,
        batch_size=10, process_concurrency=1,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout))
    tasks.append(self.task.async_load_gen_docs(
        self.cluster, def_bucket, doc_gen["read"],
        "read", 0,
        batch_size=10, process_concurrency=1,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout))

    # Start Sub_document mutations
    for index, op_type in enumerate(doc_ops):
        curr_doc_gen = sub_doc_gen[op_type]
        mutation_count = curr_doc_gen.end - curr_doc_gen.start
        if op_type != "read":
            verification_dict["ops_update"] += mutation_count

        if index < 2:
            # Durability doc_loader for first two ops specified in doc_ops
            tasks.append(self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, curr_doc_gen,
                op_type, 0,
                path_create=True,
                batch_size=10, process_concurrency=1,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout))
            if op_type != "read" and self.durability_level:
                verification_dict["sync_write_committed_count"] += \
                    mutation_count
        else:
            # Non-SyncWrites for last two ops specified in doc_ops
            tasks.append(self.task.async_load_gen_sub_docs(
                self.cluster, def_bucket, curr_doc_gen,
                op_type, 0,
                path_create=True,
                batch_size=10, process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout))

    # Wait for all task to complete
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)

    # Update num_items to sync with new docs created
    self.num_items *= 2
    verification_dict["ops_create"] = self.num_items
    if self.durability_level:
        verification_dict["sync_write_committed_count"] += \
            self.num_items

    # Verify doc count and other stats
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Verify vb-details cbstats
    failed = self.durability_helper.verify_vbucket_details_stats(
        def_bucket, self.cluster_util.get_kv_nodes(),
        vbuckets=self.cluster_util.vbuckets,
        expected_val=verification_dict)
    if failed:
        self.fail("Cbstat vbucket-details verification failed")
def test_non_overlapping_similar_crud(self):
    """
    Run the same sub-doc operation from two parallel loaders working on
    non-overlapping halves of the key space, next to a full-doc reader.

    Flow:
    1. Create a second batch of docs so that self.num_items doubles
    2. Build one generator per half for the operation read from the
       "op_type" param (insert / upsert / remove); for upsert / remove
       the sub-docs are inserted first
    3. Start both sub-doc loaders and a reader over all docs in parallel
    4. Verify doc count and the vbucket-details cbstats

    Any other "op_type" value fails the test.
    """

    sub_doc_op = self.input.param("op_type", "create")
    parallel_tasks = []
    bucket = self.bucket_util.buckets[0]

    # Expected values for the vbucket-details stat validation
    expected_stats = {
        "ops_create": 0,
        "ops_update": 0,
        "ops_delete": 0,
        "rollback_item_count": 0,
        "sync_write_aborted_count": 0,
        "sync_write_committed_count": 0,
    }

    self.log.info("Loading documents to support further sub_doc ops")
    create_gen = doc_generator(
        self.key, self.num_items, self.num_items * 2,
        doc_size=self.doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    create_task = self.task.async_load_gen_docs(
        self.cluster, bucket, create_gen, "create", self.maxttl,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(create_task)

    # The doc count before doubling is the boundary between the two
    # key ranges used by the parallel loaders
    half_of_num_items = self.num_items
    self.num_items *= 2

    # Account for the created docs and validate the count
    expected_stats["ops_create"] = self.num_items
    if self.durability_level:
        expected_stats["sync_write_committed_count"] = self.num_items

    self.log.info("Validating doc_count")
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Generators for the parallel phase
    sub_doc_gens = {}
    read_gen = doc_generator(self.key, 0, self.num_items)
    if sub_doc_op == DocLoading.Bucket.SubDocOps.INSERT:
        sub_doc_gens[0] = sub_doc_generator(
            self.key, 0, half_of_num_items,
            key_size=self.key_size,
            doc_size=self.sub_doc_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        sub_doc_gens[1] = sub_doc_generator(
            self.key, half_of_num_items, self.num_items,
            key_size=self.key_size,
            doc_size=self.sub_doc_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
    elif sub_doc_op in [DocLoading.Bucket.SubDocOps.UPSERT,
                        DocLoading.Bucket.SubDocOps.REMOVE]:
        self.log.info("Creating sub_docs before upsert/remove operation")
        initial_sub_doc_gen = sub_doc_generator(
            self.key, 0, self.num_items,
            key_size=self.key_size,
            doc_size=self.sub_doc_size,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)

        # Template index per half: (0, 1) by default, 2 for both
        # halves when the operation is a remove
        if sub_doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
            first_template, second_template = 2, 2
        else:
            first_template, second_template = 0, 1

        insert_task = self.task.async_load_gen_sub_docs(
            self.cluster, bucket, initial_sub_doc_gen,
            DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
            path_create=True,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(insert_task)

        expected_stats["ops_update"] += self.num_items
        if self.durability_level:
            expected_stats["sync_write_committed_count"] += \
                self.num_items

        sub_doc_gens[0] = sub_doc_generator_for_edit(
            self.key,
            start=0,
            end=half_of_num_items,
            key_size=self.key_size,
            template_index=first_template,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        sub_doc_gens[1] = sub_doc_generator_for_edit(
            self.key,
            start=half_of_num_items,
            end=self.num_items,
            key_size=self.key_size,
            template_index=second_template,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
    else:
        self.fail("Invalid sub_doc operation '%s'" % sub_doc_op)

    # One sub-doc loader per half of the key space; both loaders are
    # started with the same durability / replicate_to / persist_to
    for half_index in (0, 1):
        parallel_tasks.append(self.task.async_load_gen_sub_docs(
            self.cluster, bucket, sub_doc_gens[half_index],
            sub_doc_op, self.maxttl,
            path_create=True,
            batch_size=10, process_concurrency=1,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout))

    # Generic reader task - reads entire document instead of sub-doc
    parallel_tasks.append(self.task.async_load_gen_docs(
        self.cluster, bucket, read_gen, "read", 0,
        batch_size=10, process_concurrency=1,
        timeout_secs=self.sdk_timeout))

    expected_stats["ops_update"] += self.num_items
    if self.durability_level:
        expected_stats["sync_write_committed_count"] += self.num_items

    # Block until every parallel task has finished
    for running_task in parallel_tasks:
        self.task.jython_task_manager.get_task_result(running_task)

    # Verify doc count and other stats
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    stats_mismatch = self.durability_helper.verify_vbucket_details_stats(
        bucket, self.cluster_util.get_kv_nodes(),
        vbuckets=self.cluster_util.vbuckets,
        expected_val=expected_stats)
    if stats_mismatch:
        self.fail("Cbstat vbucket-details verification failed")
def test_basic_ops(self):
    """
    Basic test for Sub-doc CRUD operations

    Loads sub-docs into `self.num_items` documents, then upserts or
    removes sub-docs on the first half of the key range depending on the
    `op_type` test parameter. The cbstats vbucket-details counters and
    the bucket item count are validated after every phase. Any other
    `op_type` is only logged as unsupported.
    """
    doc_op = self.input.param("op_type", None)
    def_bucket = self.bucket_util.buckets[0]
    supported_d_levels = self.bucket_util.get_supported_durability_levels()

    # Stat validation reference variables
    verification_dict = dict()
    verification_dict["ops_create"] = self.num_items
    verification_dict["ops_update"] = 0
    verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    if self.durability_level in supported_d_levels:
        verification_dict["sync_write_committed_count"] += self.num_items

    # Initial validation
    failed = self.durability_helper.verify_vbucket_details_stats(
        def_bucket, self.cluster_util.get_kv_nodes(),
        vbuckets=self.cluster_util.vbuckets,
        expected_val=verification_dict)
    if failed:
        self.fail("Cbstat vbucket-details verification failed")

    # Generators expect a list of vbuckets, so wrap a single value
    if self.target_vbucket and not isinstance(self.target_vbucket, list):
        self.target_vbucket = [self.target_vbucket]

    self.log.info("Creating doc_generator..")
    # Load basic docs into bucket
    doc_create = sub_doc_generator(
        self.key, 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    self.log.info("Loading {0} docs into the bucket: {1}".format(
        self.num_items, def_bucket))
    task = self.task.async_load_gen_sub_docs(
        self.cluster, def_bucket, doc_create,
        DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
        path_create=True,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(task)

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Update verification_dict and validate
    # (a sub-doc insert is accounted as an update of the parent doc)
    verification_dict["ops_update"] += self.num_items
    if self.durability_level in supported_d_levels:
        verification_dict["sync_write_committed_count"] += self.num_items

    failed = self.durability_helper.verify_vbucket_details_stats(
        def_bucket, self.cluster_util.get_kv_nodes(),
        vbuckets=self.cluster_util.vbuckets,
        expected_val=verification_dict)
    if failed:
        self.fail("Cbstat vbucket-details verification failed")

    # Verify initial doc load count
    self.log.info("Validating doc_count in buckets")
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    self.log.info("Creating doc_generator for doc_op")
    num_item_start_for_crud = int(self.num_items / 2)

    template_index = 0
    if doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
        template_index = 2

    sub_doc_gen = sub_doc_generator_for_edit(
        self.key,
        start=0,
        end=num_item_start_for_crud,
        key_size=self.key_size,
        template_index=template_index)

    if doc_op == DocLoading.Bucket.SubDocOps.UPSERT:
        self.log.info("Performing 'upsert' mutation over the sub-docs")
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, doc_op, self.maxttl,
            path_create=True,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        # Only the mutations that did not fail are expected to show up
        # in the stats, so failed keys are subtracted (not added)
        update_failures = len(task.fail.keys())
        verification_dict["ops_update"] += \
            (sub_doc_gen.end - sub_doc_gen.start) - update_failures
        if self.durability_level in supported_d_levels:
            verification_dict["sync_write_committed_count"] += \
                num_item_start_for_crud - update_failures

        # Edit doc_gen template to read the mutated value as well
        sub_doc_gen.template = \
            sub_doc_gen.template.replace(" }}", ", \"mutated\": \"\" }}")
        # Read all the values to validate update operation
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, "read", 0,
            batch_size=100, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Update failed key", "Value"])
        for key, value in task.success.items():
            doc_value = value["value"]
            # A key is reported when any of the read-back fields does
            # not hold its post-upsert value
            if doc_value[0] != 2 \
                    or doc_value[1] != "LastNameUpdate" \
                    or doc_value[2] != "TypeChange" \
                    or doc_value[3] != "CityUpdate" \
                    or json.loads(str(doc_value[4])) != ["get", "up"]:
                op_failed_tbl.add_row([key, doc_value])

        op_failed_tbl.display("Update failed for keys:")
        if len(op_failed_tbl.rows) != 0:
            self.fail("Update failed for few keys")
    elif doc_op == DocLoading.Bucket.SubDocOps.REMOVE:
        self.log.info("Performing 'remove' mutation over the sub-docs")
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, doc_op, 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        # Only the mutations that did not fail are expected to show up
        # in the stats, so failed keys are subtracted (not added)
        remove_failures = len(task.fail.keys())
        verification_dict["ops_update"] += \
            (sub_doc_gen.end - sub_doc_gen.start) - remove_failures
        if self.durability_level in supported_d_levels:
            verification_dict["sync_write_committed_count"] += \
                num_item_start_for_crud - remove_failures

        # Edit doc_gen template to read the mutated value as well
        sub_doc_gen.template = sub_doc_gen.template \
            .replace(" }}", ", \"mutated\": \"\" }}")
        # Read all the values to validate update operation
        task = self.task.async_load_gen_sub_docs(
            self.cluster, def_bucket, sub_doc_gen, "read", 0,
            batch_size=100, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(task)

        op_failed_tbl = TableView(self.log.error)
        op_failed_tbl.set_headers(["Delete failed key", "Value"])

        for key, value in task.success.items():
            doc_value = value["value"]
            # First field must read 2 and every other requested path
            # must be gone; report each offending key only once
            stale_path_found = any(
                doc_value[index] != "PATH_NOT_FOUND"
                for index in range(1, len(doc_value)))
            if doc_value[0] != 2 or stale_path_found:
                op_failed_tbl.add_row([key, doc_value])

        # Keys that could not be read at all are failures as well
        for key, value in task.fail.items():
            op_failed_tbl.add_row([key, value["value"]])

        op_failed_tbl.display("Delete failed for keys:")
        if len(op_failed_tbl.rows) != 0:
            self.fail("Delete failed for few keys")
    else:
        self.log.warning("Unsupported doc_operation")

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets()

    # Validate verification_dict and validate
    failed = self.durability_helper.verify_vbucket_details_stats(
        def_bucket, self.cluster_util.get_kv_nodes(),
        vbuckets=self.cluster_util.vbuckets,
        expected_val=verification_dict)
    if failed:
        self.fail("Cbstat vbucket-details verification failed")

    self.log.info("Validating doc_count")
    self.bucket_util.verify_stats_all_buckets(self.num_items)
def test_basic_ops(self):
    """
    Sub-doc CRUD sanity test, driven by the `op_type` test parameter.

    Sub-docs are inserted into `self.num_items` documents; the first half
    of the key range is then upserted or removed (per `op_type`) and the
    same paths are read back. The cbstats vbucket-details counters and
    the bucket item count are validated after each phase. Any other
    `op_type` is only logged as unsupported.
    """
    op_type = self.input.param("op_type", None)
    bucket = self.cluster.buckets[0]

    # Expected cbstats vbucket-details values, adjusted after each phase
    expected_stats = {
        "ops_create": self.num_items,
        "ops_update": 0,
        "ops_delete": 0,
        "rollback_item_count": 0,
        "sync_write_aborted_count": 0,
        "sync_write_committed_count": 0,
    }
    if self.is_sync_write_enabled:
        expected_stats["sync_write_committed_count"] += self.num_items

    def check_vbucket_details():
        # Fail the test as soon as cbstats disagree with expected_stats
        if self.durability_helper.verify_vbucket_details_stats(
                bucket, self.cluster_util.get_kv_nodes(self.cluster),
                vbuckets=self.cluster.vbuckets,
                expected_val=expected_stats):
            self.fail("Cbstat vbucket-details verification failed")

    # Initial validation
    check_vbucket_details()

    if self.target_vbucket and type(self.target_vbucket) is not list:
        self.target_vbucket = [self.target_vbucket]

    self.log.info("Creating doc_generator..")
    # Insert sub-docs into `self.num_items` documents
    load_gen = sub_doc_generator(
        self.key, 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.sub_doc_size,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster.vbuckets)
    self.log.info("Loading {0} docs into the bucket: {1}"
                  .format(self.num_items, bucket))
    load_task = self.task.async_load_gen_sub_docs(
        self.cluster, bucket, load_gen,
        DocLoading.Bucket.SubDocOps.INSERT, self.maxttl,
        path_create=True,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout)
    self.task.jython_task_manager.get_task_result(load_task)

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)

    # Account for the inserts that went through and those that did not
    failed_inserts = len(load_task.fail.keys())
    expected_stats["ops_update"] += self.num_items - failed_inserts
    if self.is_sync_write_enabled:
        expected_stats["sync_write_committed_count"] += \
            self.num_items - failed_inserts
        expected_stats["sync_write_aborted_count"] += failed_inserts
    check_vbucket_details()

    # Verify initial doc load count
    self.log.info("Validating doc_count in buckets")
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

    self.log.info("Creating doc_generator for doc_op")
    half_of_items = int(self.num_items / 2)
    is_remove = op_type == DocLoading.Bucket.SubDocOps.REMOVE
    edit_gen = sub_doc_generator_for_edit(
        self.key,
        start=0,
        end=half_of_items,
        key_size=self.key_size,
        template_index=2 if is_remove else 0)

    def mutate_then_read(exp, **mutation_kwargs):
        # Run `op_type` over edit_gen, fold the outcome into
        # expected_stats, then read the same paths back.
        # Returns (number of failed mutations, finished read task).
        mutation = self.task.async_load_gen_sub_docs(
            self.cluster, bucket, edit_gen, op_type, exp,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            **mutation_kwargs)
        self.task.jython_task_manager.get_task_result(mutation)

        failures = len(mutation.fail.keys())
        expected_stats["ops_update"] += half_of_items - failures
        if self.is_sync_write_enabled:
            expected_stats["sync_write_committed_count"] += \
                half_of_items - failures

        # Extend the generator template so the read covers "mutated" too
        edit_gen.template = \
            edit_gen.template.replace(" }}", ", \"mutated\": \"\" }}")
        reader = self.task.async_load_gen_sub_docs(
            self.cluster, bucket, edit_gen, "read", 0,
            batch_size=100, process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task.jython_task_manager.get_task_result(reader)
        return failures, reader

    if op_type == DocLoading.Bucket.SubDocOps.UPSERT:
        self.log.info("Performing 'upsert' mutation over the sub-docs")
        update_failures, read_task = mutate_then_read(self.maxttl,
                                                      path_create=True)

        # Values every successfully updated key must read back
        expected_values = set(['StateUpdate', 2, 'LastNameUpdate',
                               'TypeChange', 'CityUpdate',
                               'FirstNameUpdate'])

        mismatch_tbl = TableView(self.log.error)
        mismatch_tbl.set_headers(["Update failed key", "Value"])
        # Keys whose read-back values differ from the expected set
        for doc_key, read_result in read_task.success.items():
            if set(read_result["value"]) == expected_values:
                continue
            mismatch_tbl.add_row([doc_key, read_result["value"]])
        mismatch_tbl.display("Update failed for keys:")

        # Only the keys whose upsert failed may hold non-updated values
        self.assertEqual(len(mismatch_tbl.rows), update_failures, "")
    elif is_remove:
        self.log.info("Performing 'remove' mutation over the sub-docs")
        _, read_task = mutate_then_read(0)

        removed_tbl = TableView(self.log.error)
        removed_tbl.set_headers(["Delete failed key", "Value"])
        # Every read that failed is recorded with its error
        for doc_key, read_result in read_task.fail.items():
            removed_tbl.add_row([doc_key, read_result["error"]])
        removed_tbl.display("Delete succeeded for keys:")

        # All reads over the removed range are expected to fail, which
        # indicates the sub-documents are no longer accessible
        self.assertEqual(len(removed_tbl.rows), half_of_items,
                         "Delete failed for few keys")
    else:
        self.log.warning("Unsupported doc_operation")

    self.log.info("Wait for ep_all_items_remaining to become '0'")
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)

    # Final validation of the accumulated expectations
    check_vbucket_details()

    self.log.info("Validating doc_count")
    self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
def test_timeout_with_crud_failures(self):
    """
    Test to make sure timeout is handled in durability calls
    and no documents are loaded when durability cannot be met using
    error simulation in server node side

    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify no operations succeeds
    4. Revert the error scenario from the cluster to resume durability
    5. Validate all mutations are succeeded after reverting
       the error condition

    Note: self.sdk_timeout values is considered as 'seconds'
    """

    # Local method to validate vb_seqno
    def validate_vb_seqno_stats(node):
        """
        Fetch the latest vbucket seqnos of `node` and compare them with
        the values recorded before the error was simulated

        :param node: Target node whose seqno stats are validated
        :return retry_validation: Boolean denoting to retry validation
        """
        retry_validation = False
        vb_info["post_timeout"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        init_stat = vb_info["init"][node.ip]
        post_stat = vb_info["post_timeout"][node.ip]
        for vb_index in range(self.vbuckets):
            # Seqno stats are keyed by the vbucket number as a string
            vb_num = str(vb_index)
            if vb_num not in affected_vbs:
                if init_stat[vb_num] != post_stat[vb_num]:
                    self.log_failure(
                        "Unaffected vb-%s stat updated: %s != %s"
                        % (vb_num, init_stat[vb_num], post_stat[vb_num]))
            elif vb_index in target_nodes_vbuckets["active"]:
                if init_stat[vb_num] != post_stat[vb_num]:
                    self.log_failure(
                        err_msg
                        % (node.ip, "active", vb_num,
                           init_stat[vb_num], post_stat[vb_num]))
            elif vb_index in target_nodes_vbuckets["replica"]:
                # An unchanged replica seqno is only warned about and
                # asks the caller to validate again
                if init_stat[vb_num] == post_stat[vb_num]:
                    retry_validation = True
                    self.log.warning(
                        err_msg
                        % (node.ip, "replica", vb_num,
                           init_stat[vb_num], post_stat[vb_num]))
        return retry_validation

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    target_nodes_vbuckets = dict()
    vb_info = dict()
    tasks = dict()
    doc_gen = dict()
    # vbucket numbers (as strings) targeted by the mutation generators
    affected_vbs = set()

    target_nodes_vbuckets["active"] = []
    target_nodes_vbuckets["replica"] = []
    vb_info["init"] = dict()
    vb_info["post_timeout"] = dict()
    vb_info["afterCrud"] = dict()

    # Override crud_batch_size to minimum value for testing
    self.crud_batch_size = 5

    timeout_err_str = self.durability_helper.EXCEPTIONS["request_timeout"]
    ambiguous_err_str = self.durability_helper.EXCEPTIONS["ambiguous"]

    # Create required doc_generators
    # int() keeps the key indexes integral under true division as well
    insert_start = int(self.num_items / 2)
    upsert_start = int(self.num_items / 4)
    doc_gen["insert"] = sub_doc_generator(
        self.key, insert_start, insert_start + self.crud_batch_size)
    doc_gen["remove"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size, template_index=2)
    doc_gen["read"] = sub_doc_generator_for_edit(
        self.key, 0, self.crud_batch_size, template_index=0)
    doc_gen["upsert"] = sub_doc_generator_for_edit(
        self.key, upsert_start, upsert_start + self.crud_batch_size,
        template_index=1)

    target_nodes = self.getTargetNodes()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        target_nodes_vbuckets["active"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        target_nodes_vbuckets["replica"] += \
            cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                             vbucket_type="replica")
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])

    curr_time = int(time.time())
    expected_timeout = curr_time + self.sdk_timeout

    # Tasks are created without starting so that the error condition
    # is in place before any of them runs
    for op_type in doc_gen:
        tasks[op_type] = self.task.async_load_gen_sub_docs(
            self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
            path_create=True,
            batch_size=1, process_concurrency=8,
            replicate_to=self.replicate_to, persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            start_task=False)

    # Perform specified action
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    for op_type in doc_gen:
        self.task_manager.add_new_task(tasks[op_type])

    # Wait for document_loader tasks to complete
    for op_type in doc_gen:
        self.task.jython_task_manager.get_task_result(tasks[op_type])

        # Validate task failures
        if op_type == "read":
            # Validation for read task
            for doc_id, crud_result in tasks[op_type].success.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.vbuckets)
                if vb_num in target_nodes_vbuckets["active"]:
                    self.log_failure("Read succeeded for %s present in "
                                     "stopped active vbucket: %s"
                                     % (doc_id, vb_num))
            self.durability_helper.validate_durability_exception(
                tasks[op_type].fail, timeout_err_str)
        else:
            # Validation of CRUDs - Update / Create / Delete
            if len(tasks[op_type].success.keys()) != 0:
                self.log_failure("Few keys succeeded for %s: %s"
                                 % (op_type,
                                    tasks[op_type].success.keys()))
            for doc_id, crud_result in tasks[op_type].fail.items():
                vb_num = self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.vbuckets)
                # Keys active on a target node must time out, all the
                # others must report an ambiguous failure
                if vb_num in target_nodes_vbuckets["active"]:
                    if timeout_err_str not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))
                else:
                    if ambiguous_err_str not in str(crud_result["error"]):
                        self.log_failure(
                            "Invalid exception for doc %s, vb %s: %s"
                            % (doc_id, vb_num, crud_result))

    # Revert the specified error scenario
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Check whether the timeout triggered properly
    if int(time.time()) < expected_timeout:
        self.log_failure("Timed-out before expected time")

    # Collect the vbuckets targeted by the mutation generators
    for op_type in doc_gen:
        if op_type == "read":
            continue
        while doc_gen[op_type].has_next():
            doc_id, _ = doc_gen[op_type].next()
            affected_vbs.add(
                str(self.bucket_util.get_vbucket_num_for_key(
                    doc_id, self.vbuckets)))

    err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
    # Fetch latest stats and validate the seq_nos are not updated
    for node in target_nodes:
        retry_count = 0
        max_retry = 3
        while retry_count < max_retry:
            self.log.info("Trying to validate vbseq_no stats: %d"
                          % (retry_count + 1))
            retry_count += 1
            retry_required = validate_vb_seqno_stats(node)
            if not retry_required:
                break
            self.sleep(5, "Sleep for vbseq_no stats to update")
        else:
            # This will be exited only if `break` condition is not met
            self.log_failure("validate_vb_seqno_stats verification failed")

    self.validate_test_failure()

    # If replicas+1 == total nodes, verify no mutation should have
    # succeeded with durability
    if self.nodes_init == self.num_replicas + 1:
        read_gen = doc_generator(self.key, 0, self.num_items)
        read_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, read_gen, "read", 0,
            batch_size=500, process_concurrency=1,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(read_task)

        failed_keys = TableView(self.log.error)
        failed_keys.set_headers(["Key", "Error"])
        for doc_key, doc_info in read_task.success.items():
            mutated = json.loads(str(doc_info["value"]))["mutated"]
            if mutated != 0:
                failed_keys.add_row([doc_key, doc_info])

        failed_keys.display("Affected mutations:")
        self.log.error(read_task.fail)

    # SDK client for retrying AMBIGUOUS for unexpected keys
    sdk_client = SDKClient(RestConnection(self.cluster.master),
                           self.bucket)
    try:
        # Doc error validation
        for op_type in doc_gen:
            task = tasks[op_type]
            expected_failures = \
                doc_gen[op_type].end - doc_gen[op_type].start

            if self.nodes_init == 1 \
                    and len(task.fail.keys()) != expected_failures:
                self.log_failure(
                    "Failed keys %d are less than expected %d"
                    % (len(task.fail.keys()), expected_failures))

            # Create table objects for display
            table_view = TableView(self.log.error)
            ambiguous_table_view = TableView(self.log.error)
            table_view.set_headers(["Key", "Exception"])
            ambiguous_table_view.set_headers(["Key", "vBucket"])

            # Iterate failed keys for validation
            for doc_key, doc_info in task.fail.items():
                # Use the bucket's vbucket count, same as every other
                # key -> vbucket mapping done in this test
                vb_for_key = self.bucket_util.get_vbucket_num_for_key(
                    doc_key, self.vbuckets)

                if vb_for_key in target_nodes_vbuckets["active"]:
                    expected_exception = timeout_err_str
                elif vb_for_key in target_nodes_vbuckets["replica"]:
                    expected_exception = ambiguous_err_str
                else:
                    # Key is not hosted on any target node: record it
                    # and retry the ambiguous operation
                    expected_exception = ambiguous_err_str
                    ambiguous_table_view.add_row([doc_key, vb_for_key])
                    retry_success = self.durability_helper \
                        .retry_for_ambiguous_exception(
                            sdk_client, op_type, doc_key, doc_info)
                    if not retry_success:
                        self.log_failure("%s failed in retry for %s"
                                         % (op_type, doc_key))

                if expected_exception not in str(doc_info["error"]):
                    table_view.add_row([doc_key, doc_info["error"]])

            # Display the tables (if any errors)
            table_view.display("Unexpected exception during %s" % op_type)
            ambiguous_table_view.display("Ambiguous exception during %s"
                                         % op_type)
    finally:
        # Close the SDK connection even if the validation above raised
        sdk_client.close()

    # Verify doc count after expected CRUD failure
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Retry the same CRUDs after reverting the failure environment
    # NOTE(review): this uses the full-doc loader (async_load_gen_docs)
    # with the sub-doc generators that were already iterated above -
    # confirm this is intended and that the task rewinds the generators
    tasks = list()
    for op_type in doc_gen:
        tasks.append(
            self.task.async_load_gen_docs(
                self.cluster, self.bucket, doc_gen[op_type], op_type, 0,
                batch_size=10, process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout))

    # Wait for document_loader tasks to complete
    for task in tasks:
        self.task.jython_task_manager.get_task_result(task)
        if len(task.fail.keys()) != 0:
            self.log_failure(
                "Failures with no error condition: {0}, {1}".format(
                    task.fail, task.fail.keys()))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(self.num_items)

    # Fetch latest stats and validate the values are updated
    for node in target_nodes:
        vb_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
            self.log_failure("vBucket seq_no stats not updated")

    # Disconnect the shell connection
    for node in target_nodes:
        shell_conn[node.ip].disconnect()

    self.validate_test_failure()