def induce_and_revert_failure(self, action):
    """Simulate the given error on the last cluster node, then restore it.

    Stop-type actions (stop_memcached / stop_prometheus) are reverted
    explicitly after a fixed wait. Kill-type actions need no revert:
    the babysitter restarts the killed process on its own.

    :param action: CouchbaseError action string to simulate
    """
    node = self.servers[-1]
    shell = RemoteMachineShellConnection(node)
    simulator = CouchbaseError(self.log, shell)
    simulator.create(action)
    self.sleep(20, "Wait before reverting the error condition")
    # Only stop-type errors require an explicit revert; for kill
    # actions the babysitter brings the process back automatically
    if action in (CouchbaseError.STOP_MEMCACHED,
                  CouchbaseError.STOP_PROMETHEUS):
        simulator.revert(action)
    shell.disconnect()
def stop_process(self):
    """Stop memcached on the third cluster node for 20s, then restore it."""
    node = self.servers[2]
    shell = RemoteMachineShellConnection(node)
    simulator = CouchbaseError(self.log, shell)
    action = "stop_memcached"
    # Induce the failure, hold it for a fixed window, then undo it
    simulator.create(action)
    self.sleep(20, "Wait before reverting the error condition")
    simulator.revert(action)
    # Close the ssh session
    shell.disconnect()
def test_with_sync_write(self):
    """Interleave a durable (sync) write with an async write on the same
    vbucket set while an error condition is active on a random KV node.

    A sync-write op (thread 1) is started first, a plain async op
    (thread 2) follows a second later; the error condition is reverted
    only after the async op completes, so the sync-write finishes last.
    """
    # Pick a random KV node and derive which vbucket type / error to
    # simulate for the configured durability level
    cluster_node = choice(self.kv_nodes)
    target_vb_type, simulate_error = \
        DurabilityHelper.get_vb_and_error_type(self.durability_level)
    # Generate exactly two keys landing on the target node's vbuckets
    doc_gen = doc_generator(
        self.key, 0, 2,
        target_vbucket=self.node_data[cluster_node]["%s_vbs" % target_vb_type])
    client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket, self.scope_name, self.collection_name)
    key_1, value_1 = doc_gen.next()
    key_2, value_2 = doc_gen.next()
    # Pre-create the docs so non-CREATE ops have something to act on
    if self.doc_ops[0] != DocLoading.Bucket.DocOps.CREATE:
        client.crud(DocLoading.Bucket.DocOps.CREATE, key_1, value_1)
    if self.doc_ops[1] != DocLoading.Bucket.DocOps.CREATE:
        client.crud(DocLoading.Bucket.DocOps.CREATE, key_2, value_2)
    # Thread 1: durable write (expected to block under the error sim)
    sync_op = Thread(target=self.crud,
                     args=[client, self.doc_ops[0], key_1],
                     kwargs={
                         "value": value_1,
                         "durability": self.durability_level,
                         "expected_thread_val": 1
                     })
    # Thread 2: plain async write against the second key
    async_op = Thread(target=self.crud,
                      args=[client, self.doc_ops[1], key_2],
                      kwargs={
                          "value": value_2,
                          "expected_thread_val": 0
                      })
    cb_err = CouchbaseError(self.log,
                            self.node_data[cluster_node]["shell"])
    cb_err.create(simulate_error, self.bucket.name)
    # Start doc_ops
    sync_op.start()
    self.sleep(1, "Wait before async operation")
    async_op.start()
    # Wait for ops to complete
    async_op.join()
    # Revert the error only after the async op is done, then let the
    # pending sync-write complete
    cb_err.revert(simulate_error, self.bucket.name)
    sync_op.join()
    self.validate_test_failure()
def MB36948(self):
    """Regression test for MB-36948.

    While memcached is stopped on one node, an async SET is expected to
    succeed but a subsequent durable (sync) write on the same key must
    fail with DurabilityAmbiguousException. Memcached is then resumed
    and the item count is verified.

    NOTE(review): node_to_stop is servers[0], which may be the master
    the SDK client connects to — presumably the async SET still succeeds
    because the key's active vbucket lives elsewhere; confirm topology.
    """
    node_to_stop = self.servers[0]
    self.log.info("Adding index/query node")
    self.task.rebalance([self.cluster.master], [self.servers[2]], [],
                        services=["n1ql,index"])
    self.log.info("Creating SDK client connection")
    client = SDKClient([self.cluster.master],
                       self.bucket_util.buckets[0],
                       compression_settings=self.sdk_compression)
    self.log.info("Stopping memcached on: %s" % node_to_stop)
    ssh_conn = RemoteMachineShellConnection(node_to_stop)
    err_sim = CouchbaseError(self.log, ssh_conn)
    err_sim.create(CouchbaseError.STOP_MEMCACHED)
    # Async write should go through even with memcached stopped on
    # the target node
    result = client.crud("create", "abort1", "abort1_val")
    if not result["status"]:
        self.log_failure("Async SET failed")
    # Durable write with a short timeout must NOT succeed while the
    # replica-side memcached is down
    result = client.crud("update", "abort1", "abort1_val",
                         durability=self.durability_level,
                         timeout=3, time_unit="seconds")
    if result["status"]:
        self.log_failure("Sync write succeeded")
    if SDKException.DurabilityAmbiguousException not in result["error"]:
        self.log_failure("Invalid exception for sync_write: %s" % result)
    self.log.info("Resuming memcached on: %s" % node_to_stop)
    err_sim.revert(CouchbaseError.STOP_MEMCACHED)
    # Only the async-created doc should remain: expect exactly 1 item
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.verify_stats_all_buckets(1)
    self.log.info("Closing ssh & SDK connections")
    ssh_conn.disconnect()
    client.close()
    self.validate_test_failure()
def test_prometheus_and_ns_server_stats_after_failure_scenarios(self):
    """
    Run all metrics before and after failure scenarios and validate
    both ns_server and prometheus stats
    """
    self.bucket_util.load_sample_bucket(self.cluster, TravelSample())
    node = self.servers[0]
    shell = RemoteMachineShellConnection(node)
    simulator = CouchbaseError(self.log, shell)
    self.log.info("Before failure")
    self.get_all_metrics(self.components, self.parse, self.metric_name)
    try:
        # Induce the error condition and keep it active for a fixed window
        simulator.create(self.simulate_error)
        self.sleep(20, "Wait before reverting the error condition")
    finally:
        # Always revert the simulated error and close the ssh session,
        # even if the sleep/create path raised
        simulator.revert(self.simulate_error)
        shell.disconnect()
    self.log.info("After failure")
    self.get_all_metrics(self.components, self.parse, self.metric_name)
def test_stop_process(self):
    """
    1. Starting loading docs into the default bucket
    2. Stop the requested process, which will impact the memcached
       operations
    3. Wait for load bucket task to complete
    4. Validate the docs for durability
    """
    error_to_simulate = self.input.param("simulate_error", None)
    def_bucket = self.bucket_util.buckets[0]
    target_node = self.getTargetNode()
    remote = RemoteMachineShellConnection(target_node)
    error_sim = CouchbaseError(self.log, remote)
    target_vbuckets = self.getVbucketNumbers(remote, def_bucket.name,
                                             self.target_node)
    if len(target_vbuckets) == 0:
        self.log.error("No target vbucket list generated to load data")
        remote.disconnect()
        return

    # Create doc_generator targeting only the active/replica vbuckets
    # present in the target_node
    gen_load = doc_generator(self.key, self.num_items,
                             self.new_docs_to_add,
                             key_size=self.key_size,
                             doc_size=self.doc_size,
                             doc_type=self.doc_type,
                             target_vbucket=target_vbuckets,
                             vbuckets=self.cluster_util.vbuckets)
    if self.atomicity:
        # Transactional load path
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets, gen_load, "create",
            exp=0, batch_size=10,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            update_count=self.update_count,
            transaction_timeout=self.transaction_timeout,
            commit=True, sync=self.sync)
    else:
        # Plain doc load with durability settings
        task = self.task.async_load_gen_docs(
            self.cluster, def_bucket, gen_load, "create", exp=0,
            batch_size=1, process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            skip_read_on_error=True)

    # Induce the error condition
    error_sim.create(error_to_simulate)
    self.sleep(20, "Wait before reverting the error condition")
    # Revert the simulated error condition and close the ssh session
    error_sim.revert(error_to_simulate)
    remote.disconnect()

    # Wait for doc loading task to complete
    self.task.jython_task_manager.get_task_result(task)
    if not self.atomicity:
        # Failures are tolerated only when loading replica vbuckets
        # with low replica count; otherwise they are unexpected
        if len(task.fail.keys()) != 0:
            if self.target_node == "active" or self.num_replicas in [2, 3]:
                self.log_failure("Unwanted failures for keys: %s"
                                 % task.fail.keys())
        # All failures must be DurabilityAmbiguous (ambiguity caused
        # by the simulated error), never another exception type
        validate_passed = \
            self.durability_helper.validate_durability_exception(
                task.fail,
                SDKException.DurabilityAmbiguousException)
        if not validate_passed:
            self.log_failure("Unwanted exception seen during validation")

        # Create SDK connection for CRUD retries
        sdk_client = SDKClient([self.cluster.master], def_bucket)
        # With the error reverted, every previously failed key must now
        # be creatable
        for doc_key, crud_result in task.fail.items():
            result = sdk_client.crud("create",
                                     doc_key,
                                     crud_result["value"],
                                     replicate_to=self.replicate_to,
                                     persist_to=self.persist_to,
                                     durability=self.durability_level,
                                     timeout=self.sdk_timeout)
            if result["status"] is False:
                self.log_failure("Retry of doc_key %s failed: %s"
                                 % (doc_key, result["error"]))
        # Close the SDK connection
        sdk_client.close()

    # Update self.num_items
    self.num_items += self.new_docs_to_add
    if not self.atomicity:
        # Validate doc count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)
    self.validate_test_failure()
def test_rollback_n_times(self):
    """Repeatedly force vbucket rollback on a random KV node.

    Each iteration: stop persistence on the target node, load docs
    (which accumulate in the ep-engine disk queue), verify queue sizes,
    then kill memcached so the unpersisted mutations are rolled back on
    warmup. After all iterations, seq-no stats must match the
    pre-rollback snapshot and doc counts must be unchanged.
    """
    doc_loading_task_2 = None
    ep_queue_size_map = dict()
    vb_replica_queue_size_map = dict()
    expected_num_items = \
        self.bucket_util.get_expected_total_num_items(self.bucket)
    # Seq-no related stats that must survive the rollback unchanged
    keys_to_verify = ["max_visible_seqno",
                      "num_items",
                      "high_completed_seqno",
                      "purge_seqno"]

    # Rollback requires at least one replica on another node
    if self.nodes_init < 2 or self.num_replicas < 1:
        self.fail("Not enough nodes/replicas to test rollback")

    # Fetch vbucket stats for validation
    self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

    target_node = choice(self.kv_nodes)
    shell = self.node_shells[target_node]["shell"]
    error_sim = CouchbaseError(self.log, shell)
    cb_stats = self.node_shells[target_node]["cbstat"]
    self.target_vbuckets = cb_stats.vbucket_list(self.bucket.name)

    for _ in xrange(1, self.num_rollbacks + 1):
        self.total_rollback_items = 0
        # Stop persistence so the loaded mutations never hit disk
        error_sim.create(CouchbaseError.STOP_PERSISTENCE,
                         self.bucket.name)
        doc_loading_task_1 = self.load_docs(self.doc_ops)
        if self.rollback_with_multiple_mutation:
            doc_loading_task_2 = self.load_docs("update")
        # Only the target node should have a non-zero disk queue
        for node in self.cluster.nodes_in_cluster:
            ep_queue_size = 0
            if node.ip == target_node.ip:
                ep_queue_size = self.total_rollback_items
                if self.sync_write_enabled:
                    # Includes prepare+commit mutation
                    ep_queue_size *= 2
            ep_queue_size_map.update({node: ep_queue_size})
            vb_replica_queue_size_map.update({node: 0})

        self.log.info("Validating stats")
        for bucket in self.bucket_util.buckets:
            self.bucket_util._wait_for_stat(bucket, ep_queue_size_map,
                                            timeout=self.wait_timeout)
            self.bucket_util._wait_for_stat(
                bucket, vb_replica_queue_size_map,
                stat_name="vb_replica_queue_size",
                timeout=self.wait_timeout)

        # Doc generators are rewound so the same keys are reused on
        # the next iteration (the mutations will be rolled back)
        if self.rollback_with_multiple_mutation:
            self.__rewind_doc_index(doc_loading_task_2)
        self.__rewind_doc_index(doc_loading_task_1)

        # Kill memcached: on warmup the unpersisted items are lost,
        # forcing replicas to roll back to the persisted seqno
        error_sim.create(CouchbaseError.KILL_MEMCACHED)
        self.assertTrue(self.bucket_util._wait_warmup_completed(
            [target_node], self.bucket, wait_time=300))
        self.bucket_util.verify_stats_all_buckets(expected_num_items,
                                                  timeout=120)

    self.get_vb_details_cbstats_for_all_nodes("post_rollback")
    self.validate_seq_no_post_rollback("pre_rollback", "post_rollback",
                                       keys_to_verify)
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def test_durability_abort(self):
    """
    Test to validate durability abort is triggered properly with
    proper rollback on active vbucket
    :return:
    """
    load_task = dict()
    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()
    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(server)
        cb_err = CouchbaseError(self.log, ssh_shell)
        # Target replica vbuckets unless durability requires active
        # persistence, in which case target active vbuckets
        target_vb_type = "replica"
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            target_vb_type = "active"
        target_vbs = cbstats.vbucket_list(self.bucket.name,
                                          target_vb_type)
        # Build a small CRUD spec (2% of each op) restricted to the
        # target vbuckets, expecting DurabilityAmbiguous under error sim
        doc_load_spec = dict()
        doc_load_spec["doc_crud"] = dict()
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 2
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbs
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] \
            = self.durability_level
        doc_load_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
            SDKException.DurabilityAmbiguousException
        ]
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 2
        doc_load_spec[MetaCrudParams.SKIP_READ_ON_ERROR] = True
        doc_load_spec[MetaCrudParams.SUPPRESS_ERROR_TABLE] = True

        # Run the load under the induced error, then revert it
        cb_err.create(self.simulate_error, self.cluster.buckets[0].name)
        load_task[server] = \
            self.bucket_util.run_scenario_from_spec(
                self.task, self.cluster, self.cluster.buckets,
                doc_load_spec, batch_size=1, validate_task=False)
        cb_err.revert(self.simulate_error, self.cluster.buckets[0].name)
        ssh_shell.disconnect()
        self.validate_test_failure()

        # Aborted sync-writes must leave vbucket-details counters intact
        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket, kv_nodes,
            vbuckets=self.cluster.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details verification failed "
                             "after aborts")
        self.validate_test_failure()

    # Retry aborted keys with healthy cluster
    self.log.info("Performing CRUDs on healthy cluster")
    for server in kv_nodes:
        self.bucket_util.validate_doc_loading_results(load_task[server])
        if load_task[server].result is False:
            self.log_failure("Doc retry task failed on %s" % server.ip)

        # Update cbstat vb-details verification counters
        for bucket, s_dict in load_task[server].loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, _ in c_dict["collections"].items():
                    c_crud_data = load_task[server].loader_spec[bucket][
                        "scopes"][s_name]["collections"][c_name]
                    for op_type in c_crud_data.keys():
                        total_mutation = \
                            c_crud_data[op_type]["doc_gen"].end \
                            - c_crud_data[op_type]["doc_gen"].start
                        if op_type in DocLoading.Bucket.DOC_OPS:
                            self.verification_dict["ops_%s" % op_type] \
                                += total_mutation
                            self.verification_dict[
                                "sync_write_committed_count"] \
                                += total_mutation
        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket, self.cluster_util.get_kv_nodes(self.cluster),
            vbuckets=self.cluster.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details verification "
                             "failed after ops on server: %s" % server.ip)
    self.validate_test_failure()
def validate_durability_with_crud(
        self, bucket, bucket_durability,
        verification_dict,
        doc_start_index=0,
        num_items_to_load=1, op_type="create",
        doc_durability=Bucket.DurabilityLevel.NONE):
    """
    Common API to validate durability settings of the bucket is set
    correctly or not.

    :param bucket: Bucket object to validate
    :param bucket_durability: Durability set for the bucket
        Note: Need this because the string within the bucket object
        is different than this.
    :param verification_dict: To hold the values for req cbstats to verify
    :param doc_start_index: Starting index to be considered for doc_load
    :param num_items_to_load: Number of items to be loaded to test.
        Default is '1'
    :param op_type: Type of CRUD to perform. Default is 'create'
    :param doc_durability: Document durability level to use during CRUD.
        Default level is 'None'
    :return:
    """
    def get_d_level_used():
        # The stronger of bucket-level vs doc-level durability wins
        if self.d_level_order.index(bucket_durability) \
                < self.d_level_order.index(doc_durability):
            return doc_durability
        return bucket_durability

    d_level_to_test = get_d_level_used()

    # Nothing to test for durability_level=None (async_write case)
    if d_level_to_test == Bucket.DurabilityLevel.NONE:
        return

    self.log.info("Performing %s operation to validate d_level %s"
                  % (op_type, d_level_to_test))
    # Can't simulate error conditions for all durability_levels.
    # So only perform CRUD without error_sim
    if len(self.vbs_in_node.keys()) > 1:
        # Pick a random node to perform error sim and load
        random_node = choice(self.vbs_in_node.keys())
        target_vb_type, simulate_error = \
            self.durability_helper.get_vb_and_error_type(d_level_to_test)
        doc_gen = doc_generator(
            self.key, doc_start_index, num_items_to_load,
            target_vbucket=self.vbs_in_node[random_node][target_vb_type])
        error_sim = CouchbaseError(self.log,
                                   self.vbs_in_node[random_node]["shell"])
        doc_load_task = self.task.async_load_gen_docs(
            self.cluster, bucket, doc_gen, op_type,
            exp=self.maxttl,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=doc_durability,
            timeout_secs=32,
            batch_size=1,
            skip_read_on_error=True,
            suppress_error_table=True,
            start_task=False,
            sdk_client_pool=self.sdk_client_pool)
        self.sleep(5, "Wait for sdk_client to get warmed_up")
        # Simulate target error condition
        error_sim.create(simulate_error)
        self.sleep(5, "Wait for error_sim to take effect")
        # Start doc_loading task and wait for it to complete
        self.task_manager.add_new_task(doc_load_task)
        self.task_manager.get_task_result(doc_load_task)
        # Revert the induced error condition
        self.sleep(5, "Wait before reverting error_simulation")
        error_sim.revert(simulate_error)
        # Validate failed doc count and exception type from SDK:
        # under error sim the durable writes must all abort with
        # DurabilityAmbiguous
        if not doc_load_task.fail.keys():
            self.log_failure("Docs inserted without honoring the "
                             "bucket durability level")
        for key, result in doc_load_task.fail.items():
            if SDKException.DurabilityAmbiguousException \
                    not in str(result["error"]):
                self.log_failure("Invalid exception for key %s "
                                 "during %s operation: %s"
                                 % (key, op_type, result["error"]))
        verification_dict["sync_write_aborted_count"] += \
            num_items_to_load
    else:
        # Single-node case: no error sim possible, plain doc_gen
        doc_gen = doc_generator(self.key, doc_start_index,
                                doc_start_index+num_items_to_load)

    # Retry the same CRUDs without any error simulation in place
    doc_load_task = self.task.async_load_gen_docs(
        self.cluster, bucket, doc_gen, op_type,
        exp=self.maxttl,
        durability=doc_durability,
        timeout_secs=2,
        batch_size=1,
        sdk_client_pool=self.sdk_client_pool)
    self.task_manager.get_task_result(doc_load_task)
    if doc_load_task.fail:
        self.log_failure("Failures seen during CRUD without "
                         "error simulation. Keys failed: %s"
                         % doc_load_task.fail.keys())
    else:
        # Healthy-cluster writes count as committed sync_writes
        verification_dict["ops_%s" % op_type] += \
            num_items_to_load
        verification_dict["sync_write_committed_count"] += \
            num_items_to_load
def test_scenario(bucket, doc_ops, with_sync_write_val=None):
    """Run a sync-write-in-progress scenario against the given bucket.

    Starts a durable CRUD batch (doc_ops[0]) while memcached is stopped
    on the target nodes, then issues individual ops (doc_ops[1]) against
    the same keys and asserts the SDK reports the sync-write-in-progress
    retry reason (AmbiguousTimeout for best-effort retry,
    RequestCanceled for fail-fast).

    NOTE(review): this closure mixes `doc_ops` (parameter) with
    `doc_op` (presumably a variable from the enclosing scope) — e.g.
    `doc_op[2]`, `doc_op[0]`. Verify `doc_op` is really defined in the
    outer function and that the mix is intentional.
    """
    # Set crud_batch_size
    crud_batch_size = 4
    simulate_error = CouchbaseError.STOP_MEMCACHED

    # Fetch target_vbs for CRUDs: replica vbuckets common to ALL
    # target nodes, so the error sim affects every targeted key
    node_vb_info = self.vbs_in_node
    target_vbuckets = node_vb_info[target_nodes[0]]["replica"]
    if len(target_nodes) > 1:
        index = 1
        while index < len(target_nodes):
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(node_vb_info[target_nodes[index]]["replica"]))
            )
            index += 1

    # Variable to hold one of the doc_generator objects
    gen_loader_1 = None
    gen_loader_2 = None

    # Initialize doc_generators to use for testing
    self.log.info("Creating doc_generators")
    gen_create = doc_generator(
        self.key, self.num_items, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    gen_update = doc_generator(
        self.key, 0, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets, mutate=1)
    gen_delete = doc_generator(
        self.key, 0, crud_batch_size,
        vbuckets=self.cluster.vbuckets,
        target_vbucket=target_vbuckets)
    self.log.info("Done creating doc_generators")

    # Start CRUD operation based on the given 'doc_op' type
    if doc_ops[0] == "create":
        self.num_items += crud_batch_size
        gen_loader_1 = gen_create
    elif doc_ops[0] in ["update", "replace", "touch"]:
        gen_loader_1 = gen_update
    elif doc_ops[0] == "delete":
        gen_loader_1 = gen_delete
        self.num_items -= crud_batch_size
    if doc_ops[1] == "create":
        gen_loader_2 = gen_create
    elif doc_ops[1] in ["update", "replace", "touch"]:
        gen_loader_2 = gen_update
    elif doc_ops[1] == "delete":
        gen_loader_2 = gen_delete

    # Load required docs for doc_op_1 in case of type != create
    if doc_op[2] == "load_initial_docs":
        doc_loading_task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_loader_1, "create", 0,
            batch_size=crud_batch_size, process_concurrency=1,
            timeout_secs=10,
            print_ops_rate=False,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(doc_loading_task)
        if doc_loading_task.fail:
            self.log_failure("Failure while loading initial docs")
        self.summary.add_step("Create docs for %s" % doc_op[0])
        verification_dict["ops_create"] += crud_batch_size
        verification_dict["sync_write_committed_count"] \
            += crud_batch_size

    # Initialize tasks and store the task objects
    doc_loader_task = self.task.async_load_gen_docs(
        self.cluster, bucket, gen_loader_1, doc_ops[0], 0,
        batch_size=crud_batch_size, process_concurrency=8,
        timeout_secs=60,
        print_ops_rate=False, start_task=False,
        sdk_client_pool=self.sdk_client_pool)

    # SDK client for performing individual ops
    client = SDKClient([self.cluster.master], bucket)

    # Perform specified action
    for node in target_nodes:
        error_sim = CouchbaseError(self.log,
                                   self.vbs_in_node[node]["shell"])
        error_sim.create(simulate_error, bucket_name=bucket.name)
    self.sleep(5, "Wait for error simulation to take effect")

    self.task_manager.add_new_task(doc_loader_task)
    self.sleep(5, "Wait for task_1 CRUDs to reach server")

    # Perform specified CRUD operation on sync_write docs:
    # iterate a deep copy so gen_loader_2's own cursor is untouched
    tem_gen = deepcopy(gen_loader_2)
    while tem_gen.has_next():
        key, value = tem_gen.next()
        for retry_strategy in [
                SDKConstants.RetryStrategy.FAIL_FAST,
                SDKConstants.RetryStrategy.BEST_EFFORT]:
            if with_sync_write_val:
                fail = client.crud(doc_ops[1], key,
                                   value=value, exp=0,
                                   durability=with_sync_write_val,
                                   timeout=3, time_unit="seconds",
                                   sdk_retry_strategy=retry_strategy)
            else:
                fail = client.crud(doc_ops[1], key,
                                   value=value, exp=0,
                                   timeout=3, time_unit="seconds",
                                   sdk_retry_strategy=retry_strategy)
            # Expected outcome differs per retry strategy
            expected_exception = \
                SDKException.AmbiguousTimeoutException
            retry_reason = \
                SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
            if retry_strategy == SDKConstants.RetryStrategy.FAIL_FAST:
                expected_exception = \
                    SDKException.RequestCanceledException
                retry_reason = \
                    SDKException.RetryReason \
                    .KV_SYNC_WRITE_IN_PROGRESS_NO_MORE_RETRIES

            # Validate the returned error from the SDK
            if expected_exception not in str(fail["error"]):
                self.log_failure("Invalid exception for {0}: {1}"
                                 .format(key, fail["error"]))
            if retry_reason not in str(fail["error"]):
                self.log_failure("Invalid retry reason for {0}: {1}"
                                 .format(key, fail["error"]))

        # Try reading the value in SyncWrite in-progress state
        fail = client.crud("read", key)
        if doc_ops[0] == "create":
            # Expected KeyNotFound in case of CREATE operation
            if fail["status"] is True:
                self.log_failure(
                    "%s returned value during SyncWrite state: %s"
                    % (key, fail))
        else:
            # Expects prev value in case of other operations
            if fail["status"] is False:
                self.log_failure(
                    "Key %s read failed for previous value: %s"
                    % (key, fail))

    # Revert the introduced error condition
    for node in target_nodes:
        error_sim = CouchbaseError(self.log,
                                   self.vbs_in_node[node]["shell"])
        error_sim.revert(simulate_error, bucket_name=bucket.name)

    # Wait for doc_loader_task to complete
    self.task.jython_task_manager.get_task_result(doc_loader_task)

    verification_dict["ops_%s" % doc_op[0]] += crud_batch_size
    verification_dict["sync_write_committed_count"] \
        += crud_batch_size

    # Disconnect the client
    client.close()
def test_update_durability_between_doc_op(self):
    """
    1. Create Bucket with durability level set.
    2. Bring down a node such that durability CRUD will wait
    3. Perform doc_op and update bucket_level_durability
    4. Revert scenario induced in step#2, such that doc_op will complete
    5. Make sure doc_ops in step#3 went through using prev. d-level
    """
    # Starting from max_durability levels because to iterate
    # all lower levels for doc_ops with level update supported
    supported_d_levels = deepcopy(self.d_level_order)
    if self.bucket_type == Bucket.Type.EPHEMERAL:
        # Ephemeral buckets support only the first two levels
        supported_d_levels = supported_d_levels[0:2]
    supported_d_levels.reverse()
    # Append the starting level again so the last update is exercised too
    supported_d_levels += [supported_d_levels[0]]

    create_desc = "Creating %s bucket with level '%s'" \
                  % (self.bucket_type, supported_d_levels[0])
    self.log.info(create_desc)
    bucket_dict = self.get_bucket_dict(self.bucket_type,
                                       supported_d_levels[0])
    # Object to support performing CRUDs and create Bucket
    bucket_obj = Bucket(bucket_dict)
    self.bucket_util.create_bucket(self.cluster, bucket_obj,
                                   wait_for_warmup=True)
    self.get_vbucket_type_mapping(bucket_obj.name)
    self.summary.add_step(create_desc)

    self.bucket_util.print_bucket_stats(self.cluster)

    # Loop to update all other durability levels
    prev_d_level = supported_d_levels[0]
    for bucket_durability in supported_d_levels[1:]:
        target_vb_type, simulate_error = \
            self.durability_helper.get_vb_and_error_type(
                bucket_durability)

        # Pick a random node to perform error sim and load
        random_node = choice(self.vbs_in_node.keys())
        error_sim = CouchbaseError(
            self.log, self.vbs_in_node[random_node]["shell"])

        target_vbs = self.vbs_in_node[random_node][target_vb_type]
        doc_gen = doc_generator(self.key, 0, 1,
                                target_vbucket=target_vbs)

        # Doc op uses d_level NONE so the effective durability comes
        # from the bucket-level setting at execution time
        doc_load_task = self.task.async_load_gen_docs(
            self.cluster, bucket_obj, doc_gen, "update",
            durability=Bucket.DurabilityLevel.NONE,
            timeout_secs=60,
            start_task=False,
            sdk_client_pool=self.sdk_client_pool)

        # Simulate target error condition
        error_sim.create(simulate_error)
        self.sleep(5, "Wait before starting doc_op")
        self.task_manager.add_new_task(doc_load_task)

        # Update bucket durability while the doc_op is (possibly) stuck
        new_d_level = BucketDurability[bucket_durability]
        self.sleep(5, "Wait before updating bucket level "
                      "durability=%s" % new_d_level)

        self.bucket_util.update_bucket_property(
            self.cluster.master,
            bucket_obj,
            bucket_durability=new_d_level)
        self.bucket_util.print_bucket_stats(self.cluster)

        buckets = self.bucket_util.get_all_buckets(self.cluster)
        if buckets[0].durability_level != new_d_level:
            self.log_failure("Failed to update bucket_d_level to %s"
                             % new_d_level)
        self.summary.add_step("Set bucket-durability=%s" % new_d_level)

        # With previous level NONE the op must have completed already;
        # with any durable previous level it must still be pending
        if prev_d_level == Bucket.DurabilityLevel.NONE:
            if not doc_load_task.completed:
                self.log_failure(
                    "Doc-op still pending for d_level 'NONE'")
        elif doc_load_task.completed:
            self.log_failure("Doc-op completed before reverting the "
                             "error condition: %s" % simulate_error)

        # Revert the induced error condition
        error_sim.revert(simulate_error)

        self.task_manager.get_task_result(doc_load_task)
        if doc_load_task.fail:
            self.log_failure("Doc_op failed")
        self.summary.add_step("Doc_op with previous d_level %s"
                              % prev_d_level)
        prev_d_level = bucket_durability

    # Delete the bucket on server
    self.bucket_util.delete_bucket(self.cluster, bucket_obj)
    self.summary.add_step("Delete %s bucket" % self.bucket_type)
def test_maxttl_with_timeout(self):
    """
    1. Stop Memcached on target_nodes based on replicas configured.
    2. Initiate doc_ops with higher sdk_timeout
    3. Sleep for time within the configured sdk_timeout
    4. Resume Memcached on target_nodes to make sure doc_ops go through
    5. Make sure maxTTL is calculated as soon as the active vbucket
       receives the mutation
    :return:
    """
    shell_conn = dict()
    target_vbuckets = list()
    target_nodes = self.getTargetNodes()
    def_bucket = self.cluster.buckets[0]
    self.maxttl = self.input.param("doc_ttl", self.maxttl)

    # Open required SDK connections before error_simulation
    # NOTE(review): target_vbuckets is still empty here; it is extended
    # in-place inside the loop below. Presumably doc_generator holds the
    # list reference and consumes it lazily — confirm, otherwise the
    # generator targets no vbuckets.
    gen_create = doc_generator(self.key, 0, self.num_items,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=target_vbuckets,
                               vbuckets=self.cluster.vbuckets)
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", self.maxttl,
        batch_size=10, process_concurrency=8,
        replicate_to=self.replicate_to, persist_to=self.persist_to,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        start_task=False,
        sdk_client_pool=self.sdk_client_pool)

    # Open shell_conn and create Memcached error for testing MaxTTL
    self.log.info("1. Stopping Memcached on target_nodes")
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstats = Cbstats(shell_conn[node.ip])
        # Extend the shared vbucket list with this node's replica vbs
        target_vbuckets += cbstats.vbucket_list(def_bucket.name,
                                                "replica")
        cb_error = CouchbaseError(self.log, shell_conn[node.ip])
        cb_error.create(CouchbaseError.STOP_MEMCACHED, def_bucket.name)

    self.log.info("2. Initiating the doc_ops with doc TTL")
    self.task_manager.add_new_task(doc_op_task)

    self.sleep(self.maxttl, "3. Sleep for max_ttl time")

    # Revert Memcached error and close the shell_conn
    self.log.info("4. Resuming Memcached on target_nodes")
    for node in target_nodes:
        cb_error = CouchbaseError(self.log, shell_conn[node.ip])
        cb_error.revert(CouchbaseError.STOP_MEMCACHED, def_bucket.name)
        shell_conn[node.ip].disconnect()

    self.log.info("5. Waiting for doc_ops to complete")
    self.task.jython_task_manager.get_task_result(doc_op_task)

    # Run the expiry pager aggressively so expired docs get purged
    self.bucket_util._expiry_pager(self.cluster, val=1)
    self.sleep(10, "6. Waiting for items to be purged")

    # Read all expired docs to validate all keys present
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "read",
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("7. Validating docs expired after TTL, "
                  "even before sync_write succeeds")
    # If every read still succeeds, nothing expired -> failure
    if len(doc_op_task.success.keys()) == self.num_items:
        self.fail("No docs deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    self.sleep(10, "8. Waiting for all docs to be purged")

    # Read all expired docs to validate all keys present
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "read",
        batch_size=10, process_concurrency=8,
        timeout_secs=self.sdk_timeout,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("9. Validating docs expired after TTL")
    # Now ALL reads must fail (every doc expired and purged)
    if len(doc_op_task.fail.keys()) != self.num_items:
        self.fail("Items not deleted after MaxTTL time: %s"
                  % doc_op_task.success.keys())

    # Validate cas for purged items: purged keys must report CAS == 0
    keys_with_cas = list()
    for key, result in doc_op_task.fail.items():
        if result['cas'] != 0:
            keys_with_cas.append(key)
    if len(keys_with_cas) != 0:
        self.fail("Following failed keys has CAS: %s"
                  % keys_with_cas)

    # Recreate all docs without any node issues
    doc_op_task = self.task.async_load_gen_docs(
        self.cluster, def_bucket, gen_create, "create", 0,
        batch_size=10, process_concurrency=8,
        durability=self.durability_level,
        timeout_secs=self.sdk_timeout,
        compression=self.sdk_compression,
        sdk_client_pool=self.sdk_client_pool)
    self.task.jython_task_manager.get_task_result(doc_op_task)

    self.log.info("10. Validating docs exists after creation")
    if len(doc_op_task.fail.keys()) != 0:
        self.fail("Doc recreate failed for keys: %s"
                  % doc_op_task.fail.keys())

    # Final doc_count validation
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.verify_stats_all_buckets(self.cluster,
                                              self.num_items)
def test_doc_size_exceptions(self):
    """
    Basic tests for document CRUD operations using JSON docs.

    Phases:
    1. Load docs 0-100 on a healthy cluster with durability=majority
    2. Stop memcached on servers[1] and load docs 100-200, expecting
       (and ignoring) RequestTimeoutException
    3. Revert the error and repeat the load on the healthy cluster
    """
    def_bucket = self.bucket_util.buckets[0]
    self.cluster_util.add_node(self.servers[1])
    # Normalize target_vbucket into a list form
    if self.target_vbucket and type(self.target_vbucket) is not list:
        self.target_vbucket = [self.target_vbucket]
    self.log.info("Creating doc_generator..")
    # Load basic docs into bucket
    doc_create = doc_generator(
        self.key, 0, 100, doc_size=self.doc_size,
        doc_type=self.doc_type, target_vbucket=self.target_vbucket,
        vbuckets=self.vbuckets)
    self.log.info("doc_generator created")
    unwanted, retried = self.bucket_util.load_bucket_exceptions(
        self.cluster, def_bucket, doc_create, "create", 0,
        batch_size=10,
        process_concurrency=1,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout,
        retries=self.sdk_retries,
        durability="majority")
    # Keep the shell in a local so it can be disconnected later
    # (fix: previously the connection was created inline and leaked)
    shell = RemoteMachineShellConnection(self.servers[1])
    err = CouchbaseError(self.log, shell)
    err.create(CouchbaseError.STOP_MEMCACHED)
    doc_create = doc_generator(
        self.key, 100, 200, doc_size=self.doc_size,
        doc_type=self.doc_type, target_vbucket=self.target_vbucket,
        vbuckets=self.vbuckets)
    # With memcached stopped, durable writes are expected to time out;
    # those exceptions are explicitly ignored
    unwanted, retried = self.bucket_util.load_bucket_exceptions(
        self.cluster, def_bucket, doc_create, "create", 0,
        batch_size=100,
        process_concurrency=1,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout,
        retries=self.sdk_retries,
        durability="majority",
        ignore_exceptions=["RequestTimeoutException"])
    err.revert(CouchbaseError.STOP_MEMCACHED)
    # Close the ssh session used for the error simulation
    shell.disconnect()
    # Same load must succeed once the error condition is reverted
    unwanted, retried = self.bucket_util.load_bucket_exceptions(
        self.cluster, def_bucket, doc_create, "create", 0,
        batch_size=100,
        process_concurrency=1,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to,
        timeout_secs=self.sdk_timeout,
        retries=self.sdk_retries,
        durability="majority")
def test_create_remove_collection_with_node_crash(self):
    """
    1. Select a error scenario to simulate in random
    2. Create error scenario either before or after collection action
    3. Initiate collection creation/deletion under the bucket
    4. Validate the outcome of collection creation/deletion
    """
    def create_collection(client_type, bucket_obj, scope, collection):
        # Create 'collection' under 'scope' via either the SDK client
        # (closure over 'client' below) or the REST helper
        if client_type == "sdk":
            client.create_collection(collection, scope)
            # Keep the local bucket object's view in sync with server
            self.bucket_util.create_collection_object(bucket_obj, scope,
                                                      {"name": collection})
        elif client_type == "rest":
            self.bucket_util.create_collection(self.cluster.master,
                                               bucket_obj,
                                               scope,
                                               {"name": collection})
        else:
            self.log_failure("Invalid client_type provided")

    def remove_collection(client_type, bucket_obj, scope, collection):
        # Drop 'collection' via SDK or REST, mirroring create_collection
        if client_type == "sdk":
            client.drop_collection(scope, collection)
            self.bucket_util.mark_collection_as_dropped(bucket_obj, scope,
                                                        collection)
        elif client_type == "rest":
            self.bucket_util.drop_collection(self.cluster.master,
                                             bucket_obj, scope, collection)
        else:
            self.log_failure("Invalid client_type provided")

    kv_nodes = self.cluster_util.get_kv_nodes()
    if len(kv_nodes) == 1:
        self.fail("Need atleast two KV nodes to run this test")

    client = None
    task = None
    action = self.input.param("action", "create")
    crash_during = self.input.param("crash_during", "pre_action")
    data_load_option = self.input.param("data_load_option", None)
    crash_type = self.input.param("simulate_error",
                                  CouchbaseError.KILL_MEMCACHED)

    # Use random (non-default) scope/collection names when requested
    if self.scope_name != CbServer.default_scope:
        self.scope_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_scope_name_len)
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": self.scope_name})
    if self.collection_name != CbServer.default_collection:
        self.collection_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_collection_name_len)

    # Select a KV node other than master node from the cluster
    node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

    # NOTE(review): client is taken from sdk_client_pool but never
    # released back to the pool in this test - confirm whether
    # release_client() is required at the end
    client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
    use_client = sample(["sdk", "rest"], 1)[0]
    if action == "remove" \
            and self.collection_name != CbServer.default_collection:
        # Create a collection to be removed
        create_collection(use_client, self.bucket,
                          self.scope_name, self.collection_name)

    # Create a error scenario
    self.log.info("Selected scenario for test '%s'" % crash_type)
    shell = RemoteMachineShellConnection(node_to_crash)
    cb_error = CouchbaseError(self.log, shell)
    cbstat_obj = Cbstats(shell)
    active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                         vbucket_type="active")
    # Load only vbuckets NOT active on the node being crashed, so the
    # background mutations are unaffected by the induced failure
    target_vbuckets = list(
        set(range(0, 1024)).difference(set(active_vbs)))
    doc_gen = doc_generator(self.key, 0, 1000,
                            target_vbucket=target_vbuckets)

    if crash_during == "pre_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, doc_gen,
            DocLoading.Bucket.DocOps.UPDATE,
            exp=self.maxttl,
            batch_size=200, process_concurrency=8,
            compression=self.sdk_compression,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)

    # Perform the collection action while/around the failure
    if action == "create":
        create_collection(self.client_type, self.bucket,
                          self.scope_name, self.collection_name)
    elif action == "remove":
        remove_collection(self.client_type, self.bucket,
                          self.scope_name, self.collection_name)

    if crash_during == "post_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        self.task_manager.get_task_result(task)

    self.sleep(60, "Wait before reverting the error scenario")
    cb_error.revert(crash_type)

    # Close SSH and SDK connections
    shell.disconnect()
    if self.atomicity is False:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
    self.validate_test_failure()
def test_create_remove_scope_with_node_crash(self):
    """
    1. Select a error scenario to simulate in random
    2. Create error scenario either before or after scope create/delete
    3. Initiate scope creation/deletion under the bucket
    4. Validate the outcome of scope creation/deletion
    """
    def create_scope(client_type, bucket_obj, scope):
        # Create 'scope' via either the SDK client or the REST helper
        if client_type == "sdk":
            client.create_scope(scope)
        elif client_type == "rest":
            self.bucket_util.create_scope(self.cluster.master, bucket_obj,
                                          {"name": scope})
        else:
            self.log_failure("Invalid client_type provided")

    def remove_scope(client_type, bucket_obj, scope):
        # Drop 'scope' via SDK or REST, mirroring create_scope
        if client_type == "sdk":
            client.drop_scope(scope)
        elif client_type == "rest":
            self.bucket_util.drop_scope(self.cluster.master,
                                        bucket_obj, scope)
        else:
            self.log_failure("Invalid client_type provided")

    kv_nodes = self.cluster_util.get_kv_nodes()
    if len(kv_nodes) == 1:
        self.fail("Need atleast two KV nodes to run this test")

    client = None
    action = self.input.param("action", "create")
    crash_during = self.input.param("crash_during", "pre_action")
    data_load_option = self.input.param("data_load_option", None)
    crash_type = self.input.param("simulate_error",
                                  CouchbaseError.KILL_MEMCACHED)

    # Always use a random scope name to create/remove
    # since CREATE/DROP not supported for default scope
    self.scope_name = BucketUtils.get_random_name()

    # Select a KV node other than master node from the cluster
    node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

    # Create a required client object
    if self.client_type == "sdk":
        client = SDKClient([self.cluster.master], self.bucket)

    if action == "remove":
        # Create a scope to be removed
        use_client = sample(["sdk", "rest"], 1)[0]
        create_scope(use_client, self.bucket, self.scope_name)

    # Create a error scenario
    shell = RemoteMachineShellConnection(node_to_crash)
    cb_error = CouchbaseError(self.log, shell)
    cbstat_obj = Cbstats(shell)
    active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                         vbucket_type="active")
    # Load only vbuckets NOT active on the node being crashed
    target_vbuckets = list(
        set(range(0, 1024)).difference(set(active_vbs)))
    doc_gen = doc_generator(self.key, 0, 1000,
                            target_vbucket=target_vbuckets)

    if crash_during == "pre_action":
        cb_error.create(crash_type)

    if action == "create":
        create_scope(self.client_type, self.bucket, self.scope_name)
    elif action == "remove":
        remove_scope(self.client_type, self.bucket, self.scope_name)

    if crash_during == "post_action":
        cb_error.create(crash_type)

    if data_load_option == "mutate_default_collection":
        task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, doc_gen, "update",
            exp=self.maxttl,
            batch_size=200, process_concurrency=8,
            compression=self.sdk_compression,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(task)

    self.sleep(60, "Wait before reverting the error scenario")
    cb_error.revert(crash_type)

    # Close SSH and SDK connections
    shell.disconnect()
    if self.client_type == "sdk":
        client.close()

    # NOTE(review): called without the 'self.cluster' argument that the
    # collection-crash variant of this test passes - confirm which
    # signature of validate_docs_per_collections_all_buckets is current
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
class BucketWarmup(CollectionBase):
    """Tests scope/collection DDL and data load while the bucket is in a
    warmup-like state, simulated by stopping memcached on the master.

    Each test expects the operation to FAIL while memcached is stopped,
    then reverts the failure and expects the retried operation to pass.
    """

    def setUp(self):
        super(BucketWarmup, self).setUp()
        self.load_spec = self.input.param("load_spec",
                                          "def_load_random_collection")
        self.bucket = self.bucket_util.buckets[0]

    def create_scope(self):
        # Create self.scope_name under the test bucket via REST.
        # NOTE(review): other tests in this file pass {"name": scope} -
        # confirm create_scope accepts a plain name here
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      self.scope_name)

    def drop_scope(self):
        self.bucket_util.drop_scope(self.cluster.master, self.bucket,
                                    self.scope_name)
        # Keep the local bucket object in sync with the server state
        del self.bucket.scopes[self.scope_name]

    def create_collection(self):
        # Collection is always created under the default scope here
        self.bucket_util.create_collection(self.cluster.master, self.bucket,
                                           CbServer.default_scope,
                                           self.collection_name)

    def drop_collection(self):
        self.bucket_util.drop_collection(self.cluster.master, self.bucket,
                                         self.scope_name,
                                         self.collection_name)
        # Fix: was 'scope_name' (undefined local) -> NameError at runtime
        del self.bucket.scopes[self.scope_name] \
            .collections[self.collection_name]

    def random_load(self):
        # Run a random collection CRUD scenario from self.load_spec
        doc_loading_spec = \
            self.bucket_util.get_crud_template_from_package(self.load_spec)
        self.bucket_util.run_scenario_from_spec(self.task,
                                                self.cluster,
                                                self.bucket_util.buckets,
                                                doc_loading_spec,
                                                mutation_num=0)

    def perform_operation_during_bucket_warmup(self, during_warmup="default"):
        """Stop memcached on master, attempt the requested operation
        (expected to raise while memcached is down), then revert the
        error and retry the operation (expected to succeed).

        :param during_warmup: create_scope / drop_scope /
            create_collection / drop_collection / "default" (random load)
        """
        # stop memcached in master node
        shell_conn = RemoteMachineShellConnection(self.cluster.master)
        self.error_sim = CouchbaseError(self.log, shell_conn)
        self.error_sim.create(CouchbaseError.STOP_MEMCACHED)
        self.log.info("memcached stopped on master node")

        if during_warmup == "create_scope":
            try:
                self.scope_name = self.bucket_util.get_random_name()
                self.create_scope()
                # Fix: message previously read "drop scope succeeded"
                self.log_failure("create scope succeeded")
            except Exception as e:
                self.log.info(e)
                self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                self.create_scope()
        elif during_warmup == "drop_scope":
            # Pick a random non-default scope to drop (bounded retries)
            retry = 5
            while retry > 0:
                scope_dict = self.bucket_util.get_random_scopes(
                    self.bucket_util.buckets, 1, 1)
                self.scope_name = scope_dict[
                    self.bucket.name]["scopes"].keys()[0]
                if self.scope_name != "_default":
                    break
                retry -= 1
            try:
                self.drop_scope()
                self.log_failure("drop scope succeeded")
            except Exception as e:
                self.log.info(e)
                self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                self.drop_scope()
        elif during_warmup == "create_collection":
            self.collection_name = self.bucket_util.get_random_name()
            try:
                self.create_collection()
                self.log_failure("create collection succeeded")
            except Exception as e:
                self.log.info(e)
                self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                self.create_collection()
        elif during_warmup == "drop_collection":
            collections = self.bucket_util.get_random_collections(
                self.bucket_util.buckets, 1, 1, 1)
            scope_dict = collections[self.bucket.name]["scopes"]
            self.scope_name = scope_dict.keys()[0]
            # Fix: was 'scope_dict[scope_name]' (undefined local)
            self.collection_name = \
                scope_dict[self.scope_name]["collections"].keys()[0]
            try:
                self.drop_collection()
                self.log_failure("drop collection succeeded")
            except Exception as e:
                self.log.info(e)
                self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                self.drop_collection()
        else:
            try:
                self.random_load()
                self.log_failure("random operation succeeded")
            except Exception as e:
                self.log.info(e)
                self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                self.random_load()

        self.bucket_util.validate_docs_per_collections_all_buckets()
        self.validate_test_failure()

    def test_create_scope_during_warmup(self):
        self.perform_operation_during_bucket_warmup("create_scope")

    def test_drop_scope_during_warmup(self):
        self.perform_operation_during_bucket_warmup("drop_scope")

    def test_create_collection_during_warmup(self):
        self.perform_operation_during_bucket_warmup("create_collection")

    def test_delete_collection_during_warmup(self):
        self.perform_operation_during_bucket_warmup("drop_collection")

    def test_perform_random_operation_during_warmup(self):
        self.perform_operation_during_bucket_warmup()

    def tearDown(self):
        # Best-effort restore of memcached even if the test failed early.
        # NOTE(review): super().tearDown() is not called; confirm whether
        # CollectionBase cleanup is intentionally skipped here
        self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
def test_fail_node_during_meta_key_delete(self):
    """
    1. Create a meta_kv key entry
    2. Stop any random node and trigger a meta_kv key delete operation
    3. Recover back the node and make sure the purging goes through fine
    """
    custom_meta_kv_key = "key_01_%s" % self.time_stamp
    fts_key = "fts_index_%s" % int(self.time_stamp)
    rest = RestConnection(self.cluster.master)
    random_node = choice(self.cluster.servers[1:])
    # If the node to fail hosts the FTS helper's target, switch the
    # helper to another FTS node so index ops keep working
    if random_node.ip == self.cluster.fts_nodes[0].ip:
        self.fts_helper = FtsHelper(self.cluster.fts_nodes[1])

    self.log.info("Stopping meta_kv purger autorun")
    rest.disable_tombstone_purger()

    # Creating meta_kv keys
    self.__fts_index(self.op_create, fts_key, self.cluster.buckets[0],
                     CbServer.default_scope, CbServer.default_collection)
    self.__perform_meta_kv_op_on_rand_key(self.op_create,
                                          custom_meta_kv_key, "value")

    shell = RemoteMachineShellConnection(random_node)
    cb_err = CouchbaseError(self.log, shell)
    self.sleep(5, "Waiting for meta_kv sync to happen")

    # Stopping ns_server on random node - either a graceful stop or a
    # beam.smp kill, chosen at random
    if choice([True, False]):
        self.log.info("Stopping couchbase server gracefully on %s"
                      % random_node.ip)
        shell.stop_couchbase()
    else:
        cb_err.create(cb_err.KILL_BEAMSMP)

    # Removing meta_kv keys when one of the node is down
    self.__fts_index(self.op_remove, fts_key, self.cluster.buckets[0],
                     CbServer.default_scope, CbServer.default_collection)
    self.__perform_meta_kv_op_on_rand_key(self.op_remove,
                                          custom_meta_kv_key)

    self.sleep(15, "Wait before triggering purger")
    rest.run_tombstone_purger(10)
    purged_keys_dict = self.__get_purged_tombstone_from_last_run()
    # Nothing should be purged while a node is down.
    # NOTE(review): "count or count != 0" is redundant; '!= 0' suffices
    for node_ip, purged_data in purged_keys_dict.items():
        if purged_data['count'] or purged_data['count'] != 0:
            shell.disconnect()
            self.fail("%s - Some keys got purged when node is failed: %s"
                      % (node_ip, purged_data))

    # Recover from the error condition
    self.log.info("Healing the cluster node")
    shell.restart_couchbase()
    shell.disconnect()
    self.sleep(60, "Wait for node to come online")

    deleted_keys = self.cluster_util.get_ns_config_deleted_keys_count()
    self.log.info(deleted_keys)
    # All nodes must agree on the deleted-keys count after healing
    del_key_count = None
    for node_ip, curr_count in deleted_keys.items():
        if del_key_count is None:
            del_key_count = curr_count
        elif del_key_count != curr_count:
            self.fail(
                "%s - Deleted keys count mismatch. Expected %s, got %s"
                % (node_ip, del_key_count, curr_count))

    rest.run_tombstone_purger(10)
    # Validate the key has been deleted from meta_kv
    purged_keys_dict = self.__get_purged_tombstone_from_last_run()
    for node_ip, purged_data in purged_keys_dict.items():
        if purged_data['count'] == 0:
            self.fail("%s - No keys purged: %s" % (node_ip, purged_data))
        if purged_data['count'] != del_key_count:
            self.fail("%s - Purged key count mismatch. Expected %s, got %s"
                      % (node_ip, del_key_count, purged_data['count']))
        if custom_meta_kv_key not in purged_data['keys']:
            self.fail("%s - Key %s missing in purger: %s"
                      % (node_ip, custom_meta_kv_key, purged_data['keys']))
def test_fail_node_during_purge_run(self):
    """
    1. Create and remove meta_kv key(s)
    2. Hard failover a random node (non-orchestrator) such that
       the purger gets triggered as expected
    3. Let the purger run complete
    """
    t_key = "fts_index-%s" % int(self.time_stamp) + "-%s"
    rest = RestConnection(self.cluster.master)
    self.log.info("Stopping meta_kv purger autorun")
    rest.disable_tombstone_purger()

    self.log.info("Creating fts_index tombstones")
    # Create then immediately remove each index so only tombstones
    # remain in meta_kv
    for index in range(self.num_index):
        key = t_key % index
        self.__fts_index(self.op_create, key, self.cluster.buckets[0],
                         CbServer.default_scope,
                         CbServer.default_collection)
        self.__fts_index(self.op_remove, key, self.cluster.buckets[0],
                         CbServer.default_scope,
                         CbServer.default_collection)

    deleted_keys = self.cluster_util.get_ns_config_deleted_keys_count()
    # All nodes must report the same deleted-keys count before failure
    del_key_count = None
    for node_ip, curr_count in deleted_keys.items():
        if del_key_count is None:
            del_key_count = curr_count
        elif del_key_count != curr_count:
            self.fail(
                "%s - Deleted keys count mismatch. Expected %s, got %s"
                % (node_ip, del_key_count, curr_count))

    # Fail a random non-orchestrator node (kill beam or graceful stop)
    random_node = choice(self.cluster.servers[1:])
    shell = RemoteMachineShellConnection(random_node)
    cb_err = CouchbaseError(self.log, shell)
    if choice([True, False]):
        cb_err.create(cb_err.KILL_BEAMSMP)
    else:
        self.log.info("Stopping couchbase server gracefully on %s"
                      % random_node.ip)
        shell.stop_couchbase()

    self.sleep(15, "Wait before triggering purger")
    self.log.info("Triggering purger when a node is in failed state")
    rest.run_tombstone_purger(10)
    purged_keys_dict = self.__get_purged_tombstone_from_last_run()
    # Nothing should be purged while the node is down.
    # NOTE(review): "count or count != 0" is redundant; '!= 0' suffices
    for node_ip, purged_data in purged_keys_dict.items():
        if purged_data['count'] or purged_data['count'] != 0:
            shell.disconnect()
            self.fail("%s - Some keys got purged when node is failed: %s"
                      % (node_ip, purged_data))

    self.log.info("Healing node with server restart")
    shell.restart_couchbase()
    shell.disconnect()
    self.sleep(60, "Wait for node to come online")

    # NOTE(review): message copied from above; the node is healed at
    # this point - likely meant "after the node is healed"
    self.log.info("Triggering purger when a node is in failed state")
    rest.run_tombstone_purger(10)
    self.sleep(10, "Wait for purger to run")
    purged_keys_dict = self.__get_purged_tombstone_from_last_run()
    # Now every node must have purged exactly del_key_count tombstones
    for node_ip, purged_data in purged_keys_dict.items():
        if purged_data['count'] == 0:
            self.fail("%s - No keys purged: %s" % (node_ip, purged_data))
        if purged_data['count'] != del_key_count:
            self.fail("%s - Purged key count mismatch. Expected %s, got %s"
                      % (node_ip, del_key_count, purged_data['count']))
def test_stop_process(self):
    """
    1. Starting loading docs into the default bucket
    2. Stop the requested process, which will impact the
       memcached operations
    3. Wait for load bucket task to complete
    4. Validate the docs for durability
    """
    error_to_simulate = self.input.param("simulate_error", None)
    target_node = self.getTargetNode()
    remote = RemoteMachineShellConnection(target_node)
    error_sim = CouchbaseError(self.log, remote)
    target_vbuckets = CrashTest.getVbucketNumbers(
        remote, self.bucket.name, self.target_node)

    # Pick one random bucket/scope/collection to load into
    bucket_dict = BucketUtils.get_random_collections(
        self.cluster.buckets,
        req_num=1,
        consider_scopes="all",
        consider_buckets="all")

    bucket = BucketUtils.get_bucket_obj(self.cluster.buckets,
                                        bucket_dict.keys()[0])
    scope_name = bucket_dict[bucket.name]["scopes"].keys()[0]
    collection_name = bucket_dict[bucket.name][
        "scopes"][scope_name]["collections"].keys()[0]
    scope = BucketUtils.get_scope_obj(
        bucket, scope_name)
    collection = BucketUtils.get_collection_obj(scope, collection_name)

    if len(target_vbuckets) == 0:
        self.log.error("No target vbucket list generated to load data")
        remote.disconnect()
        return

    self.start_doc_loading_tasks(target_vbuckets, scope_name, collection)

    # Induce the error condition
    error_sim.create(error_to_simulate)

    self.sleep(20, "Wait before reverting the error condition")

    # Revert the simulated error condition and close the ssh session
    error_sim.revert(error_to_simulate)
    remote.disconnect()

    # Wait for doc loading task to complete
    self.task.jython_task_manager.get_task_result(self.doc_loading_task)
    if self.atomicity:
        self.task.jython_task_manager.get_task_result(
            self.transaction_load_task)
    elif self.N1qltxn:
        self.task.jython_task_manager.get_task_result(
            self.N1ql_load_task)

    if len(self.doc_loading_task.fail.keys()) != 0:
        # Failures are tolerated only on replica-targeted loads with
        # few replicas; otherwise they are unwanted
        if self.target_node == "active" or self.num_replicas in [2, 3]:
            self.log_failure("Unwanted failures for keys: %s"
                             % self.doc_loading_task.fail.keys())

        validate_passed = \
            self.durability_helper.validate_durability_exception(
                self.doc_loading_task.fail,
                SDKException.DurabilityAmbiguousException)
        if not validate_passed:
            self.log_failure("Unwanted exception seen during validation")

    # Get SDK client for CRUD retries
    sdk_client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
    for doc_key, crud_result in self.doc_loading_task.fail.items():
        # Retry each failed key once with a plain durable CREATE
        result = sdk_client.crud(DocLoading.Bucket.DocOps.CREATE,
                                 doc_key,
                                 crud_result["value"],
                                 replicate_to=self.replicate_to,
                                 persist_to=self.persist_to,
                                 durability=self.durability_level,
                                 timeout=self.sdk_timeout)
        if result["status"] is False:
            self.log_failure("Retry of doc_key %s failed: %s"
                             % (doc_key, result["error"]))
    # Close the SDK connection
    self.sdk_client_pool.release_client(sdk_client)

    self.validate_test_failure()

    # NOTE(review): other call sites in this file pass
    # (self.cluster, self.cluster.buckets) to _wait_for_stats_all_buckets
    # - confirm this single-argument form is the intended signature
    self.bucket_util._wait_for_stats_all_buckets(self.cluster.buckets)
    # Update self.num_items and validate docs per collection
    if not self.N1qltxn and self.atomicity is False:
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
def test_concurrent_failover_timer_reset(self):
    """
    1. Trigger failure on destined nodes
    2. Wait for little less time than failover_timeout
    3. Bring back few nodes back online for few seconds
    4. Make sure no auto failover triggered till next failover timeout
    5. Validate auto failovers after new timeout
    """
    services_to_fo = self.failover_order[0].split(":")
    self.nodes_to_fail = self.get_nodes_to_fail(services_to_fo,
                                                dynamic_fo_method=True)
    expected_fo_nodes = self.num_nodes_to_be_failover
    self.__update_server_obj()
    # The node that will be briefly revived to reset the failover timer
    rand_node = choice(self.nodes_to_fail.keys())
    self.__update_unaffected_node()
    self.__display_failure_node_status("Nodes to be failed")
    try:
        self.log.info("Starting auto-failover procedure")
        failover_task = ConcurrentFailoverTask(
            task_manager=self.task_manager, master=self.orchestrator,
            servers_to_fail=self.nodes_to_fail,
            expected_fo_nodes=expected_fo_nodes,
            task_type="induce_failure")
        self.task_manager.add_new_task(failover_task)
        # 0.7 * timeout: still inside the failover window when reviving
        self.sleep(int(self.timeout * 0.7),
                   "Wait before bringing back the failed nodes")

        self.log.info("Bringing back '%s' for some time" % rand_node.ip)
        new_timer = None
        shell = RemoteMachineShellConnection(rand_node)
        cb_err = CouchbaseError(self.log, shell)
        if self.nodes_to_fail[rand_node] == CouchbaseError.STOP_MEMCACHED:
            cb_err.revert(CouchbaseError.STOP_MEMCACHED)
            self.sleep(10, "Wait before creating failure again")
            cb_err.create(CouchbaseError.STOP_MEMCACHED)
            new_timer = time()
        elif self.nodes_to_fail[rand_node] == "stop_couchbase":
            # NOTE(review): branch compares against the string literal
            # "stop_couchbase" but acts with CouchbaseError.STOP_SERVER -
            # confirm the literal matches get_nodes_to_fail's values
            cb_err.revert(CouchbaseError.STOP_SERVER)
            self.sleep(10, "Wait before creating failure again")
            cb_err.create(CouchbaseError.STOP_SERVER)
            new_timer = time()
        shell.disconnect()

        # Validate the previous auto-failover task failed
        # due to the random_node coming back online
        self.task_manager.get_task_result(failover_task)
        self.assertFalse(failover_task.result,
                         "Nodes failed over though nodes became active")

        # Validate auto_failover_settings
        self.validate_failover_settings(True, self.timeout, 0,
                                        self.max_count)

        # Make sure the new auto-failover timing is honoured.
        # NOTE(review): new_timer stays None if neither branch above
        # matched rand_node's failure type - would raise TypeError here
        new_timer = new_timer + self.timeout
        while int(time()) < new_timer:
            settings = self.rest.get_autofailover_settings()
            if settings.count != 0:
                self.fail("Nodes failed over before new failover time")

        self.sleep(10, "Wait for failover rebalance to trigger")
        self.rest.monitorRebalance()

        # Validate auto_failover_settings after actual auto failover
        self.validate_failover_settings(True, self.timeout,
                                        expected_fo_nodes, self.max_count)
    finally:
        # Recover all nodes from induced failures
        failover_task = ConcurrentFailoverTask(
            task_manager=self.task_manager, master=self.orchestrator,
            servers_to_fail=self.nodes_to_fail,
            expected_fo_nodes=expected_fo_nodes,
            task_type="revert_failure")
        self.task_manager.add_new_task(failover_task)
        self.task_manager.get_task_result(failover_task)

        self.log.info("Rebalance out the failed nodes")
        result = self.cluster_util.rebalance(self.cluster)
        self.assertTrue(result, "Final rebalance failed")

        # Perform collection crud + doc_ops after rebalance operation
        self.__perform_doc_ops()