def setUp(self):
    """
    Base setup for the basic_ops tests.

    Rebalances the requested nodes into the cluster, creates the default
    bucket together with the RBAC user and builds the DurabilityHelper
    object from the durability / replicate_to / persist_to settings.
    """
    super(basic_ops, self).setUp()

    self.key = 'test_docs'.rjust(self.key_size, '0')

    # All servers after the master (up to 'nodes_init') are rebalanced in
    if self.nodes_init != 1:
        nodes_to_add = self.cluster.servers[1:self.nodes_init]
    else:
        nodes_to_add = []
    self.task.rebalance([self.cluster.master], nodes_to_add, [])
    self.cluster.nodes_in_cluster.extend(
        [self.cluster.master] + nodes_to_add)

    # Default bucket + user creation
    self.bucket_util.create_default_bucket(
        replica=self.num_replicas,
        compression_mode=self.compression_mode,
        bucket_type=self.bucket_type)
    self.bucket_util.add_rbac_user()
    self.src_bucket = self.bucket_util.get_all_buckets()

    cluster_size = len(self.cluster.nodes_in_cluster)
    self.durability_helper = DurabilityHelper(
        self.log, cluster_size,
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)

    # Reset active_resident_threshold to avoid further data load as DGM
    self.active_resident_threshold = 0

    self.cluster_util.print_cluster_stats()
    self.bucket_util.print_bucket_stats()
    self.log.info("==========Finished Basic_ops base setup========")
def setUp(self):
    """
    Setup for UpgradeTests.

    Creates the DurabilityHelper for the current cluster size and seeds
    the verification dict with the expected create / delete op counts.
    """
    super(UpgradeTests, self).setUp()

    cluster_size = len(self.cluster.nodes_in_cluster)
    self.durability_helper = DurabilityHelper(self.log, cluster_size)

    # Expected stat values: 'num_items' creates and no deletes so far
    self.verification_dict = {
        "ops_create": self.num_items,
        "ops_delete": 0,
    }
def setUp(self):
    """
    Setup for the bucket durability tests.

    Disables auto-failover, prepares the bucket creation template, the
    bucket types / durability levels to iterate over and opens one shell
    connection per KV node (stored in self.vbs_in_node[node]["shell"]).
    """
    super(BucketDurabilityBase, self).setUp()

    if len(self.cluster.servers) < self.nodes_init:
        self.fail("Not enough nodes for rebalance")

    # Disable auto-failover to avoid failover of nodes
    rest = RestConnection(self.cluster.master)
    status = rest.update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    self.durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster))
    self.kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)

    self.num_nodes_affected = 1
    if self.num_replicas > 1:
        self.num_nodes_affected = 2

    # Bucket create options representation
    self.bucket_template = {
        Bucket.name: "default",
        Bucket.ramQuotaMB: 100,
        Bucket.replicaNumber: self.num_replicas,
    }
    if self.bucket_type == Bucket.Type.MEMBASE:
        self.bucket_template[Bucket.storageBackend] = self.bucket_storage

    # These two params will be set during each iteration
    self.bucket_template[Bucket.bucketType] = None
    self.bucket_template[Bucket.durabilityMinLevel] = None

    self.bucket_types_to_test = [
        Bucket.Type.MEMBASE,
        Bucket.Type.EPHEMERAL,
        Bucket.Type.MEMCACHED,
    ]

    self.d_level_order = [
        Bucket.DurabilityLevel.NONE,
        Bucket.DurabilityLevel.MAJORITY,
        Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
        Bucket.DurabilityLevel.PERSIST_TO_MAJORITY,
    ]

    # Dict representing the possible levels supported by each bucket type
    self.possible_d_levels = dict()
    self.possible_d_levels[Bucket.Type.MEMBASE] = \
        self.bucket_util.get_supported_durability_levels()
    self.possible_d_levels[Bucket.Type.EPHEMERAL] = [
        Bucket.DurabilityLevel.NONE,
        Bucket.DurabilityLevel.MAJORITY,
    ]
    self.possible_d_levels[Bucket.Type.MEMCACHED] = [
        Bucket.DurabilityLevel.NONE,
    ]

    # Dict to store the list of active/replica VBs in each node.
    # Only the shell connection is populated here.
    self.vbs_in_node = dict()
    for kv_node in self.cluster_util.get_kv_nodes(self.cluster):
        node_shell = RemoteMachineShellConnection(kv_node)
        self.vbs_in_node[kv_node] = {"shell": node_shell}

    self.log.info("===== BucketDurabilityBase setup complete =====")
def setUp(self):
    """
    Setup for the auto-failover tests.

    Reads the test params, prepares the create/update/delete doc
    generators, sets up the cluster, performs the initial load and works
    out which servers are to be failed, added and removed.
    """
    super(AutoFailoverBaseTest, self).setUp()
    self._get_params()
    self.rest = RestConnection(self.orchestrator)

    # Doc generators used by the tests
    self.initial_load_gen = self.get_doc_generator(0, self.num_items)
    self.update_load_gen = self.get_doc_generator(0, self.update_items)
    self.delete_load_gen = self.get_doc_generator(self.update_items,
                                                  self.delete_items)

    self.set_up_cluster()
    self.load_all_buckets(self.initial_load_gen, "create", 0)

    self.server_index_to_fail = \
        self.input.param("server_index_to_fail", None)
    self.new_replica = self.input.param("new_replica", None)
    self.replica_update_during = \
        self.input.param("replica_update_during", None)

    # An explicit index from the test params wins over the default pick
    if self.server_index_to_fail is not None:
        self.server_to_fail = [
            self.cluster.servers[self.server_index_to_fail]
        ]
    else:
        self.server_to_fail = self._servers_to_fail()

    # Nodes right after the initial cluster are the ones to add,
    # the trailing nodes of the initial cluster are the ones to remove
    add_end = self.nodes_init + self.nodes_in
    self.servers_to_add = self.cluster.servers[self.nodes_init:add_end]
    remove_start = self.nodes_init - self.nodes_out
    self.servers_to_remove = \
        self.cluster.servers[remove_start:self.nodes_init]

    self.durability_helper = DurabilityHelper(
        self.log, len(self.cluster.servers), self.durability_level)

    self.active_vb_in_failover_nodes = list()
    self.replica_vb_in_failover_nodes = list()
    self.get_vbucket_info_from_failover_nodes()

    self.cluster_util.print_cluster_stats()
    self.bucket_util.print_bucket_stats()
def test_replica_update(self):
    """
    Run generic_replica_update() first with increasing and then with
    decreasing replica counts.

    The highest value used is bounded by the number of nodes in the
    cluster and by 3 (atomicity runs) or 4 (all other runs), both taken
    as exclusive upper limits of the increment range.
    The test only logs an error and returns on clusters with < 2 nodes.
    """
    replica_count = 3 if self.atomicity else 4

    if self.nodes_init < 2:
        self.log.error("Test not supported for < 2 node cluster")
        return

    doc_ops = self.input.param("doc_ops", "")
    bucket_helper = BucketHelper(self.cluster.master)

    doc_count = self.num_items
    start_doc_for_insert = self.num_items

    self.is_sync_write_enabled = DurabilityHelper.is_sync_write_enabled(
        self.bucket_durability_level, self.durability_level)

    # Replica increment tests
    increment_range = range(1, min(replica_count, self.nodes_init))
    doc_count, start_doc_for_insert = self.generic_replica_update(
        doc_count, doc_ops, bucket_helper,
        increment_range,
        start_doc_for_insert)

    # Replica decrement tests
    decrement_range = range(min(replica_count, self.nodes_init)-2, -1, -1)
    _, _ = self.generic_replica_update(
        doc_count, doc_ops, bucket_helper,
        decrement_range,
        start_doc_for_insert)
def setUp(self):
    """
    Setup for the collection tests.

    Reads all test params, creates the DurabilityHelper, disables
    auto-failover, optionally applies the disk-optimized thread settings
    and finally runs collection_setup(). Any exception raised by
    collection_setup() is passed to handle_setup_exception().
    """
    super(CollectionBase, self).setUp()
    self.log_setup_status("CollectionBase", "started")

    self.key = 'test_collection'.rjust(self.key_size, '0')

    # Test params
    self.simulate_error = self.input.param("simulate_error", None)
    self.error_type = self.input.param("error_type", "memory")
    self.doc_ops = self.input.param("doc_ops", None)
    # If True, creates bucket/scope/collections with simpler names
    self.use_simple_names = self.input.param("use_simple_names", True)
    self.spec_name = self.input.param("bucket_spec",
                                      "single_bucket.default")
    self.data_spec_name = self.input.param("data_spec_name",
                                           "initial_load")
    self.remove_default_collection = \
        self.input.param("remove_default_collection", False)
    self.action_phase = self.input.param("action_phase",
                                         "before_default_load")
    self.skip_collections_cleanup = \
        self.input.param("skip_collections_cleanup", False)
    self.validate_docs_count_during_teardown = \
        self.input.param("validate_docs_count_during_teardown", False)
    self.batch_size = self.input.param("batch_size", 200)
    self.process_concurrency = self.input.param("process_concurrency", 1)
    self.retry_get_process_num = \
        self.input.param("retry_get_process_num", 200)
    self.change_magma_quota = self.input.param("change_magma_quota",
                                               False)

    self.crud_batch_size = 100
    # Two nodes are affected when more than one replica is configured
    self.num_nodes_affected = 2 if self.num_replicas > 1 else 1

    if self.doc_ops:
        self.doc_ops = self.doc_ops.split(';')

    cluster_size = len(self.cluster.nodes_in_cluster)
    self.durability_helper = DurabilityHelper(
        self.log, cluster_size, self.durability_level)

    # Disable auto-failover to avoid failover of nodes
    rest = RestConnection(self.cluster.master)
    status = rest.update_autofailover_settings(False, 120)
    self.assertTrue(status, msg="Failure during disabling auto-failover")
    self.bucket_helper_obj = BucketHelper(self.cluster.master)

    self.disk_optimized_thread_settings = \
        self.input.param("disk_optimized_thread_settings", False)
    if self.disk_optimized_thread_settings:
        self.set_num_writer_and_reader_threads(
            num_writer_threads="disk_io_optimized",
            num_reader_threads="disk_io_optimized")

    # Java and Python exceptions are handled the same way
    try:
        self.collection_setup()
    except (Java_base_exception, Exception) as exception:
        self.handle_setup_exception(exception)

    self.supported_d_levels = \
        self.bucket_util.get_supported_durability_levels()
    self.log_setup_status("CollectionBase", "complete")
def setUp(self):
    """
    Base setup for the rebalance tests.

    Reads the test params, rebalances the requested nodes into the
    cluster, creates the default bucket with the RBAC user and loads
    'num_items' docs into every bucket. self.num_items is then replaced
    by the item count reported for the bucket, and the load / update doc
    generators plus the DurabilityHelper are created from it.
    """
    super(RebalanceBaseTest, self).setUp()

    # Test params
    self.doc_ops = self.input.param("doc_ops", "create")
    self.doc_size = self.input.param("doc_size", 10)
    self.key_size = self.input.param("key_size", 0)
    self.zone = self.input.param("zone", 1)
    self.new_replica = self.input.param("new_replica", None)
    self.default_view_name = "default_view"
    self.defaul_map_func = "function (doc) {\n emit(doc._id, doc);\n}"
    self.default_view = View(self.default_view_name,
                             self.defaul_map_func, None)
    self.max_verify = self.input.param("max_verify", None)
    self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
    self.key = 'test_docs'.rjust(self.key_size, '0')

    # Initialize cluster using given nodes
    nodes_init = self.cluster.servers[1:self.nodes_init] \
        if self.nodes_init != 1 else []
    self.task.rebalance([self.cluster.master], nodes_init, [])
    self.cluster.nodes_in_cluster.extend(
        [self.cluster.master] + nodes_init)

    self.bucket_util.create_default_bucket(replica=self.num_replicas)
    self.bucket_util.add_rbac_user()
    self.sleep(10)

    gen_create = self.get_doc_generator(0, self.num_items)
    self.print_cluster_stat_task = \
        self.cluster_util.async_print_cluster_stats()

    for bucket in self.bucket_util.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_create, "create", 0,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            batch_size=10,
            timeout_secs=self.sdk_timeout,
            process_concurrency=8,
            retries=self.sdk_retries,
            durability=self.durability_level)
        self.task.jython_task_manager.get_task_result(task)
        self.sleep(20)

        # Trust the count reported for the bucket over the requested one
        current_item = self.bucket_util.get_bucket_current_item_count(
            self.cluster, bucket)
        self.num_items = current_item
        self.log.info("Inserted {} number of items after loadgen".format(
            self.num_items))

    self.gen_load = self.get_doc_generator(0, self.num_items)
    # gen_update is used for mutating the first half of the loaded data.
    # Floor division keeps the end index an integer irrespective of the
    # division semantics in effect ('/' returns a float with true division)
    self.gen_update = self.get_doc_generator(0, (self.num_items // 2))

    self.durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)
    self.log.info("==========Finished rebalance base setup========")
def setUp(self):
    """
    Setup for the collection tests.

    Reads all test params, creates the DurabilityHelper, disables
    auto-failover and runs collection_setup(). Any exception raised by
    collection_setup() is passed to handle_setup_exception().
    """
    super(CollectionBase, self).setUp()
    self.log_setup_status("CollectionBase", "started")

    # Limits taken from the server constants
    self.MAX_SCOPES = CbServer.max_scopes
    self.MAX_COLLECTIONS = CbServer.max_collections

    self.key = 'test_collection'.rjust(self.key_size, '0')

    # Test params
    self.simulate_error = self.input.param("simulate_error", None)
    self.error_type = self.input.param("error_type", "memory")
    self.doc_ops = self.input.param("doc_ops", None)
    self.spec_name = self.input.param("bucket_spec",
                                      "single_bucket.default")
    self.data_spec_name = self.input.param("data_spec_name",
                                           "initial_load")
    self.remove_default_collection = \
        self.input.param("remove_default_collection", False)
    self.action_phase = self.input.param("action_phase",
                                         "before_default_load")
    self.skip_collections_cleanup = \
        self.input.param("skip_collections_cleanup", False)
    self.validate_docs_count_during_teardown = \
        self.input.param("validate_docs_count_during_teardown", False)
    self.batch_size = self.input.param("batch_size", 200)
    self.vbuckets = self.input.param("vbuckets",
                                     self.cluster_util.vbuckets)
    self.retry_get_process_num = \
        self.input.param("retry_get_process_num", 25)

    self.crud_batch_size = 100
    # Two nodes are affected when more than one replica is configured
    self.num_nodes_affected = 2 if self.num_replicas > 1 else 1

    if self.doc_ops:
        self.doc_ops = self.doc_ops.split(';')

    cluster_size = len(self.cluster.nodes_in_cluster)
    self.durability_helper = DurabilityHelper(
        self.log, cluster_size, self.durability_level)

    # Disable auto-failover to avoid failover of nodes
    rest = RestConnection(self.cluster.master)
    status = rest.update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")
    self.bucket_helper_obj = BucketHelper(self.cluster.master)

    # Java and Python exceptions are handled the same way
    try:
        self.collection_setup()
    except (Java_base_exception, Exception) as exception:
        self.handle_setup_exception(exception)

    self.supported_d_levels = \
        self.bucket_util.get_supported_durability_levels()
    self.log_setup_status("CollectionBase", "complete")
def test_with_sync_write(self):
    """
    Run a durable doc_op and a non-durable doc_op on two different keys
    in parallel threads while an error is simulated on a random KV node.

    doc_ops[0] is executed with self.durability_level and doc_ops[1]
    without any durability. The error is reverted once the non-durable
    thread has finished; the durable thread is joined after the revert.
    """
    op_create = DocLoading.Bucket.DocOps.CREATE

    cluster_node = choice(self.kv_nodes)
    target_vb_type, simulate_error = \
        DurabilityHelper.get_vb_and_error_type(self.durability_level)
    vb_key = "%s_vbs" % target_vb_type
    doc_gen = doc_generator(
        self.key, 0, 2,
        target_vbucket=self.node_data[cluster_node][vb_key])

    client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket, self.scope_name, self.collection_name)

    key_1, value_1 = doc_gen.next()
    key_2, value_2 = doc_gen.next()

    # Non-create ops need the doc to exist before the actual operation
    docs = [(key_1, value_1), (key_2, value_2)]
    for op_index, (doc_key, doc_value) in enumerate(docs):
        if self.doc_ops[op_index] != op_create:
            client.crud(op_create, doc_key, doc_value)

    sync_kwargs = {
        "value": value_1,
        "durability": self.durability_level,
        "expected_thread_val": 1
    }
    sync_op = Thread(target=self.crud,
                     args=[client, self.doc_ops[0], key_1],
                     kwargs=sync_kwargs)

    async_kwargs = {
        "value": value_2,
        "expected_thread_val": 0
    }
    async_op = Thread(target=self.crud,
                      args=[client, self.doc_ops[1], key_2],
                      kwargs=async_kwargs)

    cb_err = CouchbaseError(self.log,
                            self.node_data[cluster_node]["shell"])
    cb_err.create(simulate_error, self.bucket.name)

    # Start doc_ops
    sync_op.start()
    self.sleep(1, "Wait before async operation")
    async_op.start()

    # Wait for ops to complete
    async_op.join()
    cb_err.revert(simulate_error, self.bucket.name)
    sync_op.join()

    self.validate_test_failure()
def test_sync_write_in_progress(self):
    """
    Validate the errors returned for doc_ops attempted on keys which
    already have a SyncWrite in progress.

    Flow:
    1. Simulate 'self.simulate_error' on the target nodes
    2. Start an async spec-driven doc load (doc_ops[0]) using
       self.durability_level, targeting the selected vbuckets
    3. Using a separate SDK client, run doc_ops[1] on one key from each
       doc_gen of the running load and validate the returned exception /
       retry reason
    4. Read the same key and validate the read status
    5. Revert the error condition and wait for the doc load to complete
    """
    doc_ops = self.input.param("doc_ops", "create;create").split(';')

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    active_vbs = dict()
    replica_vbs = dict()

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    # Dedicated SDK client for performing doc_ops locally
    client = SDKClient([self.cluster.master], self.bucket)

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    # Created once, outside the loop, so that the seqno info of every
    # target node is retained (not just the one of the last node)
    vb_info["init"] = dict()
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(node)
        vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
        # Fetch affected nodes' active and replica vb numbers
        active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="active")
        replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
            self.bucket.name, vbucket_type="replica")

    if self.durability_level \
            == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
        # Target all the active vbuckets of the affected nodes
        target_vbuckets = list()
        for target_node in target_nodes:
            target_vbuckets += active_vbs[target_node.ip]
    else:
        # Target only the vbuckets having a replica on all affected nodes
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        for target_node in target_nodes[1:]:
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(replica_vbs[target_node.ip])))

    doc_load_spec = dict()
    doc_load_spec["doc_crud"] = dict()
    doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
        = "test_collections"
    doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
    doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
    doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.UPDATE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.REPLACE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
    elif doc_ops[0] == DocLoading.Bucket.DocOps.DELETE:
        doc_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

    # Induce error condition for testing
    for node in target_nodes:
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)
        self.sleep(3, "Wait for error simulation to take effect")

    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.cluster.buckets,
            doc_load_spec,
            async_load=True)

    self.sleep(5, "Wait for doc ops to reach server")

    for bucket, s_dict in doc_loading_task.loader_spec.items():
        for s_name, c_dict in s_dict["scopes"].items():
            for c_name, c_meta in c_dict["collections"].items():
                client.select_collection(s_name, c_name)
                for op_type in c_meta:
                    key, value = c_meta[op_type]["doc_gen"].next()
                    if self.with_non_sync_writes:
                        fail = client.crud(doc_ops[1], key, value,
                                           exp=0, timeout=2,
                                           time_unit="seconds")
                    else:
                        fail = client.crud(
                            doc_ops[1], key, value, exp=0,
                            durability=self.durability_level,
                            timeout=2, time_unit="seconds")

                    expected_exception = \
                        SDKException.AmbiguousTimeoutException
                    retry_reason = \
                        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
                    # Delete / replace of a doc whose create is still in
                    # progress is expected to fail as doc-not-found
                    if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE \
                            and doc_ops[1] in \
                            [DocLoading.Bucket.DocOps.DELETE,
                             DocLoading.Bucket.DocOps.REPLACE]:
                        expected_exception = \
                            SDKException.DocumentNotFoundException
                        retry_reason = None

                    # Validate the returned error from the SDK
                    if expected_exception not in str(fail["error"]):
                        self.log_failure("Invalid exception for %s: %s"
                                         % (key, fail["error"]))
                    if retry_reason \
                            and retry_reason not in str(fail["error"]):
                        self.log_failure(
                            "Invalid retry reason for %s: %s"
                            % (key, fail["error"]))

                    # Try reading the value in SyncWrite state
                    fail = client.crud("read", key)
                    if doc_ops[0] == "create":
                        # Expected KeyNotFound in case of CREATE op
                        if fail["status"] is True:
                            self.log_failure(
                                "%s returned value during SyncWrite %s"
                                % (key, fail))
                    else:
                        # Expects prev val in case of other operations
                        if fail["status"] is False:
                            self.log_failure(
                                "Key %s read failed for prev value: %s"
                                % (key, fail))

    # Revert the introduced error condition and close the shell
    # connections opened for the error simulation
    for node in target_nodes:
        error_sim[node.ip].revert(self.simulate_error,
                                  bucket_name=self.bucket.name)
        shell_conn[node.ip].disconnect()

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Close the SDK client created for this test
    client.close()
    self.validate_test_failure()
def setUp(self):
    """
    Base setup for the durability tests.

    Reads the test params, rebalances the requested nodes into the
    cluster, disables auto-failover, creates the default bucket with the
    RBAC user and, if an SDK client pool is available, creates the
    clients for the bucket. Unless 'skip_init_load' is set, 'num_items'
    docs are loaded and the item count is verified.
    """
    super(DurabilityTestsBase, self).setUp()

    # Test params
    self.simulate_error = self.input.param("simulate_error", None)
    self.error_type = self.input.param("error_type", "memory")
    self.doc_ops = self.input.param("doc_ops", None)
    self.with_non_sync_writes = self.input.param("with_non_sync_writes",
                                                 False)
    self.skip_init_load = self.input.param("skip_init_load", False)

    self.crud_batch_size = 100
    self.num_nodes_affected = 1
    if self.num_replicas > 1:
        self.num_nodes_affected = 2

    if self.doc_ops:
        self.doc_ops = self.doc_ops.split(';')

    # Initialize cluster using given nodes
    nodes_init = self.cluster.servers[1:self.nodes_init] \
        if self.nodes_init != 1 else []
    self.task.rebalance([self.cluster.master], nodes_init, [])
    self.cluster.nodes_in_cluster.extend([self.cluster.master]+nodes_init)

    # Created only after nodes_in_cluster is populated so that the helper
    # gets the size of the initialized cluster
    self.durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        self.durability_level)

    # Disable auto-failover to avoid failover of nodes
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120, False)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Create default bucket and add rbac user
    self.bucket_util.create_default_bucket(
        replica=self.num_replicas,
        compression_mode=self.compression_mode,
        bucket_type=self.bucket_type,
        storage=self.bucket_storage,
        eviction_policy=self.bucket_eviction_policy)
    self.bucket_util.add_rbac_user()

    self.cluster_util.print_cluster_stats()
    self.bucket = self.bucket_util.buckets[0]

    # Create sdk_clients for pool
    if self.sdk_client_pool:
        self.log.info("Creating SDK client pool")
        self.sdk_client_pool.create_clients(
            self.bucket,
            self.cluster.nodes_in_cluster,
            req_clients=self.sdk_pool_capacity,
            compression_settings=self.sdk_compression)

    if not self.skip_init_load:
        # doc_generator is given a list of target vbuckets
        if self.target_vbucket \
                and not isinstance(self.target_vbucket, list):
            self.target_vbucket = [self.target_vbucket]

        self.log.info("Creating doc_generator..")
        doc_create = doc_generator(
            self.key, 0, self.num_items,
            key_size=self.key_size,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)

        self.log.info("Loading {0} items into bucket"
                      .format(self.num_items))
        task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, doc_create, "create", 0,
            batch_size=10, process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            sdk_client_pool=self.sdk_client_pool)
        self.task.jython_task_manager.get_task_result(task)

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_stats_all_buckets(self.num_items)

    self.bucket_util.print_bucket_stats()
    self.log.info("=== DurabilityBaseTests setup complete ===")
class BucketDurabilityBase(BaseTestCase):
    """
    Common base class for the bucket durability tests.

    Holds the bucket creation template, the bucket types / durability
    levels to iterate over, one shell connection per KV node and the
    helpers used to validate the durability behaviour through CRUDs and
    cbstats.
    """

    def setUp(self):
        """
        Initialize the cluster, disable auto-failover and prepare the
        bucket template, durability level tables and per-node shells.
        """
        super(BucketDurabilityBase, self).setUp()

        if len(self.cluster.servers) < self.nodes_init:
            self.fail("Not enough nodes for rebalance")

        # Rebalance-in required nodes for testing
        nodes_init = self.cluster.servers[1:self.nodes_init] \
            if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend([self.cluster.master]+nodes_init)

        # Disable auto-failover to avoid failover of nodes
        status = RestConnection(self.cluster.master) \
            .update_autofailover_settings(False, 120, False)
        self.assertTrue(status, msg="Failure during disabling auto-failover")

        self.bucket_util.add_rbac_user()

        self.durability_helper = DurabilityHelper(
            self.log, len(self.cluster.nodes_in_cluster))
        self.kv_nodes = self.cluster_util.get_kv_nodes()

        self.num_nodes_affected = 1
        if self.num_replicas > 1:
            self.num_nodes_affected = 2

        # Bucket create options representation
        self.bucket_template = dict()
        self.bucket_template[Bucket.name] = "default"
        self.bucket_template[Bucket.ramQuotaMB] = 100
        self.bucket_template[Bucket.replicaNumber] = self.num_replicas
        # These two params will be set during each iteration
        self.bucket_template[Bucket.bucketType] = None
        self.bucket_template[Bucket.durabilityMinLevel] = None

        # Print cluster stats
        self.cluster_util.print_cluster_stats()

        self.bucket_types_to_test = [Bucket.Type.MEMBASE,
                                     Bucket.Type.EPHEMERAL,
                                     Bucket.Type.MEMCACHED]

        # Order matters: validate_durability_with_crud() compares the
        # positions of two levels in this list to find the higher one
        self.d_level_order = [
            Bucket.DurabilityLevel.NONE,
            Bucket.DurabilityLevel.MAJORITY,
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]

        # Dict representing the possible levels supported by each bucket type
        self.possible_d_levels = dict()
        self.possible_d_levels[Bucket.Type.MEMBASE] = \
            self.bucket_util.get_supported_durability_levels()
        self.possible_d_levels[Bucket.Type.EPHEMERAL] = [
            Bucket.DurabilityLevel.NONE,
            Bucket.DurabilityLevel.MAJORITY]
        self.possible_d_levels[Bucket.Type.MEMCACHED] = [
            Bucket.DurabilityLevel.NONE]

        # Dict to store the list of active/replica VBs in each node.
        # Only the shell connection is populated here; the vb lists are
        # filled in by get_vbucket_type_mapping()
        self.vbs_in_node = dict()
        for node in self.kv_nodes:
            shell = RemoteMachineShellConnection(node)
            self.vbs_in_node[node] = dict()
            self.vbs_in_node[node]["shell"] = shell
        self.log.info("===== BucketDurabilityBase setup complete =====")

    def tearDown(self):
        """
        Close the shell connections, run the base tearDown and report
        the test summary / logged failures.
        """
        # Close all shell_connections opened in setUp()
        for node in self.vbs_in_node:
            self.vbs_in_node[node]["shell"].disconnect()

        super(BucketDurabilityBase, self).tearDown()

        self.summary.display()
        self.validate_test_failure()

    @staticmethod
    def get_cb_stat_verification_dict():
        """
        :return: Dict of the cbstat counters tracked by the tests,
                 all initialized to zero
        """
        verification_dict = dict()
        verification_dict["ops_create"] = 0
        verification_dict["ops_update"] = 0
        verification_dict["ops_delete"] = 0
        verification_dict["rollback_item_count"] = 0
        verification_dict["sync_write_aborted_count"] = 0
        verification_dict["sync_write_committed_count"] = 0
        return verification_dict

    def get_vbucket_type_mapping(self, bucket_name):
        """
        Populate self.vbs_in_node[node]["active" / "replica"] with the
        vbucket lists reported by cbstats for the given bucket.

        :param bucket_name: Name of the bucket to fetch the vb lists for
        """
        for node in self.vbs_in_node:
            cb_stat = Cbstats(self.vbs_in_node[node]["shell"])
            self.vbs_in_node[node]["active"] = \
                cb_stat.vbucket_list(bucket_name, "active")
            self.vbs_in_node[node]["replica"] = \
                cb_stat.vbucket_list(bucket_name, "replica")

    def get_bucket_dict(self, bucket_type, bucket_durability):
        """
        :param bucket_type: Bucket type to set in the returned dict
        :param bucket_durability: Key into BucketDurability whose value
                                  is set as the bucket's min durability
        :return: Copy of self.bucket_template with both values filled in
        """
        bucket_dict = deepcopy(self.bucket_template)
        bucket_dict[Bucket.bucketType] = bucket_type
        bucket_dict[Bucket.durabilityMinLevel] = \
            BucketDurability[bucket_durability]

        return bucket_dict

    def get_supported_durability_for_bucket(self):
        """
        :return: List of durability levels for self.bucket_type.
                 Ephemeral buckets get NONE and MAJORITY; every other type
                 gets bucket_util.get_supported_durability_levels()
        """
        if self.bucket_type == Bucket.Type.EPHEMERAL:
            return [Bucket.DurabilityLevel.NONE,
                    Bucket.DurabilityLevel.MAJORITY]
        return self.bucket_util.get_supported_durability_levels()

    def validate_durability_with_crud(
            self, bucket, bucket_durability,
            verification_dict,
            doc_start_index=0,
            num_items_to_load=1,
            op_type="create",
            doc_durability=Bucket.DurabilityLevel.NONE):
        """
        Common API to validate durability settings of the bucket is set
        correctly or not.

        When more than one node is tracked in self.vbs_in_node, the CRUDs
        are first run with an error simulated on a random node and are
        expected to fail. The CRUDs are then run without any error
        simulation and are expected to succeed. verification_dict is
        updated in place with the expected counters.

        :param bucket: Bucket object to validate
        :param bucket_durability: Durability set for the bucket
                                  Note: Need this because the string within
                                        the bucket object is different than
                                        this.
        :param verification_dict: To hold the values for req cbstats to verify
        :param doc_start_index: Starting index to be considered for doc_load
        :param num_items_to_load: Number of items to be loaded to test.
                                  Default is '1'
        :param op_type: Type of CRUD to perform. Default is 'create'
        :param doc_durability: Document durability level to use during CRUD.
                               Default level is 'None'
        :return:
        """
        def get_d_level_used():
            # The level positioned later in d_level_order takes effect
            if self.d_level_order.index(bucket_durability) \
                    < self.d_level_order.index(doc_durability):
                return doc_durability
            return bucket_durability

        d_level_to_test = get_d_level_used()
        # Nothing to test for durability_level=None (async_write case)
        if d_level_to_test == Bucket.DurabilityLevel.NONE:
            return

        self.log.info("Performing %s operation to validate d_level %s"
                      % (op_type, d_level_to_test))

        # Can't simulate error conditions for all durability_levels.
        # So only perform CRUD without error_sim
        if len(self.vbs_in_node) > 1:
            # Pick a random node to perform error sim and load
            random_node = choice(list(self.vbs_in_node))

            target_vb_type, simulate_error = \
                self.durability_helper.get_vb_and_error_type(d_level_to_test)

            doc_gen = doc_generator(
                self.key, doc_start_index, num_items_to_load,
                target_vbucket=self.vbs_in_node[random_node][target_vb_type])
            error_sim = CouchbaseError(self.log,
                                       self.vbs_in_node[random_node]["shell"])

            # Task is only created here; it is started after the error
            # condition is in place
            doc_load_task = self.task.async_load_gen_docs(
                self.cluster, bucket, doc_gen, op_type,
                exp=self.maxttl,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=doc_durability,
                timeout_secs=32,
                batch_size=1,
                skip_read_on_error=True,
                suppress_error_table=True,
                start_task=False,
                sdk_client_pool=self.sdk_client_pool)

            self.sleep(5, "Wait for sdk_client to get warmed_up")
            # Simulate target error condition
            error_sim.create(simulate_error)
            try:
                self.sleep(5, "Wait for error_sim to take effect")

                # Start doc_loading task and wait for it to complete
                self.task_manager.add_new_task(doc_load_task)
                self.task_manager.get_task_result(doc_load_task)

                self.sleep(5, "Wait before reverting error_simulation")
            finally:
                # Revert the induced error condition, even if the load
                # task raised, so the node is not left in a broken state
                error_sim.revert(simulate_error)

            # Validate failed doc count and exception type from SDK
            if not doc_load_task.fail:
                self.log_failure("Docs inserted without honoring the "
                                 "bucket durability level")
            for key, result in doc_load_task.fail.items():
                if SDKException.DurabilityAmbiguousException \
                        not in str(result["error"]):
                    self.log_failure("Invalid exception for key %s "
                                     "during %s operation: %s"
                                     % (key, op_type, result["error"]))

            verification_dict["sync_write_aborted_count"] += num_items_to_load
        else:
            doc_gen = doc_generator(self.key, doc_start_index,
                                    doc_start_index+num_items_to_load)

        # Retry the same CRUDs without any error simulation in place
        doc_load_task = self.task.async_load_gen_docs(
            self.cluster, bucket, doc_gen, op_type,
            exp=self.maxttl,
            durability=doc_durability,
            timeout_secs=2,
            batch_size=1,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(doc_load_task)
        if doc_load_task.fail:
            self.log_failure("Failures seen during CRUD without "
                             "error simulation. Keys failed: %s"
                             % doc_load_task.fail.keys())
        else:
            verification_dict["ops_%s" % op_type] += \
                num_items_to_load
            verification_dict["sync_write_committed_count"] += \
                num_items_to_load

    def getTargetNodes(self):
        """
        Select the nodes on which the error condition will be simulated.

        For a single node cluster the master node is returned. Otherwise
        distinct nodes are picked at random from
        nodes_in_cluster[1:nodes_init] (the node at index 0 is never
        picked) until 'num_nodes_affected' nodes are selected or no
        candidate is left. The master is used as fallback when there is
        no candidate at all.

        :return: List of selected nodes
        """
        if len(self.cluster.nodes_in_cluster) <= 1:
            return [self.cluster.master]

        candidates = list(self.cluster.nodes_in_cluster[1:self.nodes_init])
        node_list = list()
        # Bounded by the candidates left, so the loop always terminates
        # even if num_nodes_affected exceeds the nodes available
        while candidates and len(node_list) < self.num_nodes_affected:
            picked_node = choice(candidates)
            node_list.append(picked_node)
            candidates = [node for node in candidates
                          if not (node == picked_node)]

        if not node_list:
            node_list.append(self.cluster.master)
        return node_list

    def cb_stat_verify(self, verification_dict):
        """
        Validate the vbucket-details stats of the first bucket against
        the expected values and record the step in the test summary.

        :param verification_dict: Expected cbstat values
        """
        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket_util.buckets[0],
            self.kv_nodes,
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details validation failed")
        self.summary.add_step("Cbstat vb-details validation")
def test_index_with_aborts(self):
    """
    Validate indexed item counts around aborted and successful sync_writes.

    1. Create index (2i/view) on default bucket
    2. Load multiple docs such that all sync_writes will be aborted
    3. Verify nothing went into indexing
    4. Load sync_write docs such that they are successful
    5. Validate the mutated docs are taken into indexing

    Test params read from ``self``: ``create_index_during``
    ("before_doc_ops" / "after_doc_ops"), ``sync_write_abort_pattern``
    ("all_aborts" / "initial_aborts" / "aborts_at_end"),
    ``use_gsi_for_primary``, ``durability_level``, ``replicate_to``,
    ``persist_to`` and ``sdk_timeout``.

    :return:
    """
    crud_batch_size = 50
    def_bucket = self.cluster.buckets[0]
    kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
    # Per-server replica vbucket list, filled in the abort-load loop
    replica_vbs = dict()
    # Expected cbstats vbucket-details values
    verification_dict = dict()
    # Two separate expectation dicts, keyed by index name; they are passed
    # to validate_indexed_doc_count / validate_indexed_count_from_stats
    index_item_count = dict()
    expected_num_indexed = dict()
    # load_gen[<"ADD"|"SET">][server] -> list of doc generators
    load_gen = dict()
    load_gen["ADD"] = dict()
    load_gen["SET"] = dict()
    # Abort patterns for which a second generator (start index 10000)
    # is created and counted as committed below
    partial_aborts = ["initial_aborts", "aborts_at_end"]

    durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)

    if self.create_index_during == "before_doc_ops":
        self.create_gsi_indexes(def_bucket)

    curr_items = self.bucket_util.get_bucket_current_item_count(
        self.cluster, def_bucket)
    if self.sync_write_abort_pattern in ["all_aborts", "initial_aborts"]:
        # Start from an empty bucket for these patterns
        self.bucket_util.flush_bucket(self.cluster, def_bucket)
        self.num_items = 0
    else:
        self.num_items = curr_items

    self.log.info("Disabling auto_failover to avoid node failures")
    status = RestConnection(self.cluster.master) \
        .update_autofailover_settings(False, 120)
    self.assertTrue(status, msg="Failure during disabling auto-failover")

    # Validate vbucket stats
    verification_dict["ops_create"] = self.num_items
    verification_dict["ops_update"] = 0
    # verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    # NOTE(review): "#primary" starts from self.num_items in one dict and
    # from curr_items (the pre-flush count) in the other - confirm this
    # asymmetry is intended
    index_item_count["#primary"] = self.num_items
    index_item_count["durable_add_aborts"] = 0
    index_item_count["durable_set_aborts"] = 0
    expected_num_indexed["#primary"] = curr_items
    expected_num_indexed["durable_add_aborts"] = 0
    expected_num_indexed["durable_set_aborts"] = 0

    if self.create_index_during == "before_doc_ops":
        self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Loading docs such that all sync_writes will be aborted")
    for server in kv_nodes:
        ssh_shell = RemoteMachineShellConnection(server)
        cbstats = Cbstats(server)
        # Generators below are restricted to this node's replica vbuckets
        replica_vbs[server] = cbstats.vbucket_list(def_bucket.name,
                                                   "replica")

        # --- ADD (create) phase ---
        load_gen["ADD"][server] = list()
        load_gen["ADD"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="ADD"))
        if self.sync_write_abort_pattern in partial_aborts:
            # Second batch (start index 10000); one batch worth of creates
            # is accounted as committed for the partial-abort patterns
            load_gen["ADD"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="ADD"))
            verification_dict["ops_create"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size
            index_item_count["#primary"] += crud_batch_size
            index_item_count["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size

        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["ADD"][server], self.cluster, def_bucket,
            self.durability_level,
            DocLoading.Bucket.DocOps.CREATE,
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        # One batch is always expected to be aborted per phase
        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        if self.create_index_during == "before_doc_ops":
            self.validate_indexed_doc_count(def_bucket, index_item_count)

        # --- SET (update) phase ---
        load_gen["SET"][server] = list()
        load_gen["SET"][server].append(
            doc_generator(self.key, 0, crud_batch_size,
                          target_vbucket=replica_vbs[server],
                          mutation_type="SET"))
        if self.sync_write_abort_pattern in partial_aborts:
            load_gen["SET"][server].append(
                doc_generator(self.key, 10000, crud_batch_size,
                              target_vbucket=replica_vbs[server],
                              mutation_type="SET"))
            verification_dict["ops_update"] += crud_batch_size
            verification_dict["sync_write_committed_count"] += \
                crud_batch_size
            # Committed SETs move one batch from the "add" index count
            # to the "set" index count
            index_item_count["durable_add_aborts"] -= crud_batch_size
            index_item_count["durable_set_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["durable_set_aborts"] += crud_batch_size

        verification_dict["sync_write_aborted_count"] += \
            crud_batch_size
        task_success = self.bucket_util.load_durable_aborts(
            ssh_shell, load_gen["SET"][server], self.cluster, def_bucket,
            self.durability_level,
            DocLoading.Bucket.DocOps.UPDATE,
            self.sync_write_abort_pattern)
        if not task_success:
            self.log_failure("Failure during load_abort task")

        ssh_shell.disconnect()

        if self.create_index_during == "before_doc_ops":
            self.validate_indexed_doc_count(def_bucket, index_item_count)

    # Cross-check the accumulated expectations against cbstats
    failed = durability_helper.verify_vbucket_details_stats(
        def_bucket, kv_nodes,
        vbuckets=self.cluster.vbuckets, expected_val=verification_dict)
    if failed:
        self.log_failure("Cbstat vbucket-details verification failed")
    self.validate_test_failure()

    if self.create_index_during == "after_doc_ops":
        self.create_gsi_indexes(def_bucket)
        self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Verify aborts are not indexed")
    self.validate_indexed_count_from_stats(def_bucket,
                                           expected_num_indexed,
                                           index_item_count)

    if not self.use_gsi_for_primary:
        self.log.info("Wait of any indexing_activity to complete")
        index_monitor_task = self.cluster_util.async_monitor_active_task(
            self.cluster.master,
            "indexer",
            "_design/ddl_#primary",
            num_iteration=20,
            wait_task=True)[0]
        self.task_manager.get_task_result(index_monitor_task)
        self.assertTrue(index_monitor_task.result,
                        "Indexer task still running on server")

    # Trim the generator lists before the successful reload: keep only the
    # first generator for "initial_aborts" and only the last one for
    # "aborts_at_end"; "all_aborts" keeps its single generator
    for server in kv_nodes:
        if self.sync_write_abort_pattern == "initial_aborts":
            load_gen["ADD"][server] = load_gen["ADD"][server][:1]
            load_gen["SET"][server] = load_gen["SET"][server][:1]
        elif self.sync_write_abort_pattern == "aborts_at_end":
            load_gen["ADD"][server] = load_gen["ADD"][server][-1:]
            load_gen["SET"][server] = load_gen["SET"][server][-1:]

    self.log.info("Load sync_write docs such that they are successful")
    for server in kv_nodes:
        for gen_load in load_gen["ADD"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "create", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")

            index_item_count["#primary"] += crud_batch_size
            index_item_count["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            self.validate_indexed_doc_count(def_bucket, index_item_count)

        for gen_load in load_gen["SET"][server]:
            task = self.task.async_load_gen_docs(
                self.cluster, def_bucket, gen_load, "update", 0,
                batch_size=50, process_concurrency=8,
                replicate_to=self.replicate_to, persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task.jython_task_manager.get_task_result(task)

            if len(task.fail.keys()) != 0:
                self.log_failure("Some failures seen during doc_ops")

            index_item_count["durable_add_aborts"] -= crud_batch_size
            index_item_count["durable_set_aborts"] += crud_batch_size
            expected_num_indexed["#primary"] += crud_batch_size
            expected_num_indexed["durable_add_aborts"] += crud_batch_size
            expected_num_indexed["durable_set_aborts"] += crud_batch_size
            self.validate_indexed_doc_count(def_bucket, index_item_count)

    self.log.info("Validate the mutated docs are taken into indexing")
    self.validate_indexed_count_from_stats(def_bucket,
                                           expected_num_indexed,
                                           index_item_count)
    self.validate_test_failure()
def setUp(self):
    """
    Prepare a CrashTest run: read test params, load the initial docs into
    every bucket and verify the resulting cbstats / item counts.

    Params read through ``self.input.param``: doc_ops (';' separated),
    process, service, sig_type, target_node, client_type, N1qltxn.
    When ``self.N1qltxn`` is set, N1QL helper objects and statements are
    also created.
    """
    super(CrashTest, self).setUp()

    self.doc_ops = self.input.param("doc_ops", None)
    self.process_name = self.input.param("process", None)
    self.service_name = self.input.param("service", "data")
    self.sig_type = self.input.param("sig_type", "SIGKILL").upper()
    self.target_node = self.input.param("target_node", "active")
    self.client_type = self.input.param("client_type", "sdk").lower()
    self.N1qltxn = self.input.param("N1qltxn", False)

    self.pre_warmup_stats = dict()
    self.timeout = 120
    self.new_docs_to_add = 10000

    if self.doc_ops is not None:
        self.doc_ops = self.doc_ops.split(";")

    # NOTE(review): durability_helper is only assigned here for
    # non-atomicity runs, but it is dereferenced unconditionally inside the
    # bucket loop below - confirm the parent setUp() also defines it
    if not self.atomicity:
        self.durability_helper = DurabilityHelper(
            self.log, self.nodes_init,
            durability=self.durability_level,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to)

    # Expected cbstats vbucket-details values, seeded from the item count
    # already tracked for the default collection of the first bucket
    verification_dict = dict()
    verification_dict["ops_create"] = \
        self.cluster.buckets[0].scopes[
            CbServer.default_scope].collections[
            CbServer.default_collection].num_items
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["pending_writes"] = 0
    if self.durability_level:
        verification_dict["sync_write_committed_count"] = \
            verification_dict["ops_create"]

    # Load initial documents into the buckets
    transaction_gen_create = doc_generator(
        "transaction_key", 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.doc_size,
        doc_type=self.doc_type,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    gen_create = doc_generator(
        self.key, 0, self.num_items,
        key_size=self.key_size,
        doc_size=self.doc_size,
        doc_type=self.doc_type,
        target_vbucket=self.target_vbucket,
        vbuckets=self.cluster_util.vbuckets)
    if self.atomicity:
        # Transactional load uses its own key prefix ("transaction_key")
        transaction_task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.cluster.buckets,
            transaction_gen_create, DocLoading.Bucket.DocOps.CREATE,
            exp=0,
            batch_size=10,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            update_count=self.update_count,
            transaction_timeout=self.transaction_timeout,
            commit=True,
            sync=self.sync)
        self.task.jython_task_manager.get_task_result(transaction_task)

    # The regular (non-transactional) load runs for every bucket,
    # irrespective of self.atomicity
    for bucket in self.cluster.buckets:
        task = self.task.async_load_gen_docs(
            self.cluster, bucket, gen_create,
            DocLoading.Bucket.DocOps.CREATE, self.maxttl,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            batch_size=10, process_concurrency=8)
        self.task.jython_task_manager.get_task_result(task)

        self.bucket_util._wait_for_stats_all_buckets(self.cluster.buckets)
        # NOTE(review): the counters below always update buckets[0] and the
        # shared verification_dict, even while iterating other buckets -
        # verify this is intended for multi-bucket runs
        self.cluster.buckets[0].scopes[
            CbServer.default_scope].collections[
            CbServer.default_collection].num_items += self.num_items
        verification_dict["ops_create"] += self.num_items
        if self.durability_level:
            verification_dict["sync_write_committed_count"] += \
                self.num_items

        # Verify cbstats vbucket-details
        stats_failed = self.durability_helper.verify_vbucket_details_stats(
            bucket, self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)

        # The cbstats result is only enforced for non-atomicity runs
        if self.atomicity is False:
            if stats_failed:
                self.fail("Cbstats verification failed")
            self.bucket_util.verify_stats_all_buckets(
                self.cluster,
                self.cluster.buckets[0].scopes[
                    CbServer.default_scope].collections[
                    CbServer.default_collection].num_items)

    self.bucket = self.cluster.buckets[0]
    if self.N1qltxn:
        self.n1ql_server = self.cluster_util.get_nodes_from_services_map(
            service_type="n1ql",
            get_all_nodes=True)
        self.n1ql_helper = N1QLHelper(server=self.n1ql_server,
                                      use_rest=True,
                                      buckets=self.cluster.buckets,
                                      log=self.log,
                                      scan_consistency='REQUEST_PLUS',
                                      num_collection=3,
                                      num_buckets=1,
                                      num_savepoints=1,
                                      override_savepoint=False,
                                      num_stmt=10,
                                      load_spec=self.data_spec_name)
        self.bucket_col = self.n1ql_helper.get_collections()
        self.stmts = self.n1ql_helper.get_stmt(self.bucket_col)
        self.stmts = self.n1ql_helper.create_full_stmts(self.stmts)
    self.log.info("==========Finished CrashTest setup========")
def setUp(self):
    """
    Prepare a CrashTest run: read test params, rebalance-in the initial
    nodes, create the default bucket (plus RBAC user and optional SDK
    client pool), load ``self.num_items`` docs and verify the stats.

    Params read through ``self.input.param``: doc_ops (';' separated),
    process, service, sig_type, target_node.
    """
    super(CrashTest, self).setUp()

    self.doc_ops = self.input.param("doc_ops", None)
    self.process_name = self.input.param("process", None)
    self.service_name = self.input.param("service", "data")
    self.sig_type = self.input.param("sig_type", "SIGKILL").upper()
    self.target_node = self.input.param("target_node", "active")

    self.pre_warmup_stats = {}
    self.timeout = 120
    self.new_docs_to_add = 10000

    if self.doc_ops is not None:
        self.doc_ops = self.doc_ops.split(";")

    # servers[0] is skipped because the master is passed to rebalance
    # separately; nothing extra is added for a single-node cluster
    nodes_init = self.cluster.servers[1:self.nodes_init] \
        if self.nodes_init != 1 else []
    self.task.rebalance([self.cluster.master], nodes_init, [])
    self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init)
    if not self.atomicity:
        # Only used on the non-atomicity path further below
        self.durability_helper = DurabilityHelper(
            self.log, self.nodes_init,
            durability=self.durability_level,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to)
    self.bucket_util.create_default_bucket(
        bucket_type=self.bucket_type, ram_quota=self.bucket_size,
        replica=self.num_replicas, compression_mode="off",
        storage=self.bucket_storage,
        eviction_policy=self.bucket_eviction_policy)
    self.bucket_util.add_rbac_user()

    if self.sdk_client_pool:
        self.log.info("Creating SDK clients for client_pool")
        for bucket in self.bucket_util.buckets:
            self.sdk_client_pool.create_clients(
                bucket,
                [self.cluster.master],
                self.sdk_pool_capacity,
                compression_settings=self.sdk_compression)

    # Expected cbstats vbucket-details values after the initial load
    verification_dict = dict()
    verification_dict["ops_create"] = self.num_items
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["pending_writes"] = 0
    if self.durability_level:
        verification_dict["sync_write_committed_count"] = self.num_items

    # Load initial documents into the buckets
    self.log.info("Loading initial documents")
    gen_create = doc_generator(self.key, 0, self.num_items,
                               key_size=self.key_size,
                               doc_size=self.doc_size,
                               doc_type=self.doc_type,
                               target_vbucket=self.target_vbucket,
                               vbuckets=self.cluster_util.vbuckets)
    if self.atomicity:
        # Transactional load; no cbstats verification on this path
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets, gen_create, "create",
            exp=0,
            batch_size=10,
            process_concurrency=self.process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            update_count=self.update_count,
            transaction_timeout=self.transaction_timeout,
            commit=True,
            sync=self.sync)
        self.task.jython_task_manager.get_task_result(task)
    else:
        for bucket in self.bucket_util.buckets:
            task = self.task.async_load_gen_docs(
                self.cluster, bucket, gen_create,
                DocLoading.Bucket.DocOps.CREATE, self.maxttl,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                batch_size=10, process_concurrency=8,
                sdk_client_pool=self.sdk_client_pool)
            self.task.jython_task_manager.get_task_result(task)

            self.bucket_util._wait_for_stats_all_buckets()
            # Verify cbstats vbucket-details
            stats_failed = \
                self.durability_helper.verify_vbucket_details_stats(
                    bucket, self.cluster_util.get_kv_nodes(),
                    vbuckets=self.cluster_util.vbuckets,
                    expected_val=verification_dict)

            if stats_failed:
                self.fail("Cbstats verification failed")
            self.bucket_util.verify_stats_all_buckets(self.num_items)

    self.cluster_util.print_cluster_stats()
    self.bucket_util.print_bucket_stats()
    self.log.info("==========Finished CrashTest setup========")
def test_with_persistence_issues(self):
    """
    Test to make sure timeout is handled in durability calls
    and document CRUDs are successful even with disk related failures

    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    The test returns early (logging a critical message) when
    ``self.durability_level`` is MAJORITY_AND_PERSIST_TO_ACTIVE or
    PERSIST_TO_MAJORITY.

    Exactly one shell connection is opened per target node. It is shared
    by the Cbstats object and the CouchbaseError simulator, and every
    connection is disconnected in a ``finally`` block, for both the
    disk-error and the couchbase-error paths.

    Note: self.sdk_timeout value is considered as 'seconds'
    """
    if self.durability_level in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    # error_sim is a dict (node.ip -> CouchbaseError) on the couchbase-error
    # path and gets rebound to a single DiskError object on the disk path
    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Simulate error condition on %s" % target_nodes)
    try:
        # Open the per-node connection and snapshot the initial stats
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            # Not read again in this test; the lookup is kept as-is
            active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
                self.bucket.name,
                "active")
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

        # Evaluated once; decides both how the error is created and reverted
        is_disk_error = self.simulate_error \
            in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]

        if is_disk_error:
            error_sim = DiskError(self.log, self.task_manager,
                                  self.cluster.master, target_nodes,
                                  60, 0, False, 120,
                                  disk_location="/data")
            error_sim.create(action=self.simulate_error)
        else:
            for node in target_nodes:
                # Reuse the connection opened above instead of opening a
                # second one (which left the first one never disconnected)
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])
                # Perform specified action
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)

        # Perform CRUDs with induced error scenario is active
        load_spec = dict()
        load_spec["doc_crud"] = dict()
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"

        self.log.info("Perform 'create', 'update', 'delete' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud(mutation_num=2)

        # Wait for doc_loading to complete and validate the doc ops
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with persistence issue")

        if is_disk_error:
            error_sim.revert(self.simulate_error)
        else:
            # Revert the induced error condition
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
            # NOTE(review): the sleep and doc-count validation are kept on
            # this branch only; SOURCE indentation was lost, confirm scope
            self.sleep(10, "Wait for node recovery to complete")

            # Doc count validation
            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

            # Failover validation: failover stats must be unchanged
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats got updated"
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level): seqnos must have moved
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect every opened shell, including when an assertion
        # above fails and on the disk-error path
        for shell in shell_conn.values():
            shell.disconnect()

    self.validate_test_failure()

    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def test_collection_not_exists(self):
    """
    Validate SDK behaviour against a missing / dropped / recreated
    scope or collection.

    1. Load docs into required collection
    2. Validate docs based on the targeted collection
    3. Create non-default scope/collection for CRUDs to happen
    4. Perform doc_ops again and perform CRUDs
    5. Drop the target collection and validate the CollectionNotExists
       exception from client side
    6. Recreate non-default collection and re-create the docs and validate

    Test param ``drop_scope`` (default False) selects whether the whole
    scope or only the collection is dropped in step 5.
    """
    def validate_vb_detail_stats():
        # Compare cbstats vbucket-details against verification_dict, then
        # validate the per-collection doc counts of all buckets
        failed = durability_helper.verify_vbucket_details_stats(
            self.bucket,
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=verification_dict)
        if failed:
            self.log_failure("vBucket_details validation failed")
        self.bucket_util.validate_docs_per_collections_all_buckets()

    # Count every collection across all scopes of the bucket
    num_cols_in_bucket = 0
    for _, scope in self.bucket.scopes.items():
        for _, _ in scope.collections.items():
            num_cols_in_bucket += 1

    # Expected cbstats values; ops_create is seeded with
    # (number of collections * self.num_items)
    verification_dict = dict()
    verification_dict["ops_create"] = num_cols_in_bucket * self.num_items
    verification_dict["ops_update"] = 0
    verification_dict["ops_delete"] = 0
    verification_dict["rollback_item_count"] = 0
    verification_dict["sync_write_aborted_count"] = 0
    verification_dict["sync_write_committed_count"] = 0

    durability_helper = DurabilityHelper(self.log,
                                         len(self.cluster.kv_nodes),
                                         durability=self.durability_level)

    drop_scope = self.input.param("drop_scope", False)
    # Non-default names are replaced with random names; these do not exist
    # until create_scope_collection() is called further below
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
    if self.collection_name != CbServer.default_collection:
        self.collection_name = self.bucket_util.get_random_name()

    # Doc generator used for mutations
    doc_gen = doc_generator("test_col_not_exists", 0, 10)

    # Acquire SDK client for mutations
    client = self.sdk_client_pool.get_client_for_bucket(
        self.bucket,
        self.scope_name,
        self.collection_name)

    doc_ttl, _ = \
        SDKExceptionTests.__get_random_doc_ttl_and_durability_level()
    self.log.info(
        "Creating docs with doc_ttl %s into %s:%s:%s"
        % (doc_ttl,
           self.bucket.name,
           self.scope_name,
           self.collection_name))

    # Phase 1: creates must succeed only for the default collection; for
    # any other target the create must fail with an error containing both
    # AmbiguousTimeoutException and the COLLECTION_NOT_FOUND retry reason
    while doc_gen.has_next():
        key, value = doc_gen.next()
        result = client.crud("create", key, value,
                             exp=doc_ttl,
                             durability=self.durability_level,
                             timeout=30)
        if self.collection_name == CbServer.default_collection:
            if result["status"] is False:
                self.log_failure("Create doc failed for key: %s" % key)
            else:
                verification_dict["ops_create"] += 1
                if self.durability_level:
                    verification_dict["sync_write_committed_count"] += 1
                self.bucket.scopes[self.scope_name].collections[
                    self.collection_name].num_items += 1
        elif result["status"] is True:
            self.log_failure("Create didn't fail as expected for key: %s"
                             % key)
        elif SDKException.AmbiguousTimeoutException \
                not in str(result["error"]) \
                or SDKException.RetryReason.COLLECTION_NOT_FOUND \
                not in str(result["error"]):
            self.log_failure("Invalid exception for key %s: %s"
                             % (key, result["error"]))

    validate_vb_detail_stats()

    # Create required scope/collection for successful CRUD operation
    self.create_scope_collection()

    # Reset doc_gen itr value for retry purpose
    doc_gen.reset()
    doc_ttl, _ = \
        SDKExceptionTests.__get_random_doc_ttl_and_durability_level()
    self.log.info(
        "Creating docs with doc_ttl %s into %s:%s:%s"
        % (doc_ttl,
           self.bucket.name,
           self.scope_name,
           self.collection_name))

    # Phase 2: the same keys were already created in phase 1 when the
    # target is the default collection, so they are updated instead
    op_type = "create"
    if self.collection_name == CbServer.default_collection:
        op_type = "update"

    while doc_gen.has_next():
        key, value = doc_gen.next()
        result = client.crud(op_type, key, value,
                             exp=doc_ttl,
                             durability=self.durability_level)
        if result["status"] is False:
            self.log_failure("Create fail for key %s: %s"
                             % (key, result))
        else:
            if op_type == "create":
                verification_dict["ops_create"] += 1
                self.bucket.scopes[self.scope_name].collections[
                    self.collection_name].num_items += 1
            else:
                verification_dict["ops_update"] += 1

            if self.durability_level:
                verification_dict["sync_write_committed_count"] += 1
    validate_vb_detail_stats()
    self.validate_test_failure()

    # Phase 3: drop the scope or the collection
    if drop_scope:
        self.log.info("Dropping scope %s" % self.scope_name)
        self.bucket_util.drop_scope(self.cluster.master,
                                    self.bucket,
                                    self.scope_name)
    else:
        self.log.info("Dropping collection %s:%s"
                      % (self.scope_name, self.collection_name))
        self.bucket_util.drop_collection(self.cluster.master,
                                         self.bucket,
                                         self.scope_name,
                                         self.collection_name)

    validate_vb_detail_stats()
    self.validate_test_failure()

    # Reset doc_gen itr value for retry purpose
    doc_gen.reset()
    # Phase 4: creates against the dropped target must not succeed
    while doc_gen.has_next():
        key, value = doc_gen.next()
        result = client.crud("create", key, value,
                             exp=doc_ttl,
                             durability=self.durability_level)
        if result["status"] is True:
            self.log_failure("Create doc succeeded for dropped collection")
    validate_vb_detail_stats()
    self.validate_test_failure()

    # Re-create the dropped collection
    self.create_scope_collection(create_scope=drop_scope)

    # Phase 5: creates must succeed again (skipped for the default
    # collection)
    if self.collection_name != CbServer.default_collection:
        doc_gen.reset()
        while doc_gen.has_next():
            key, value = doc_gen.next()
            result = client.crud("create", key, value,
                                 exp=doc_ttl,
                                 durability=self.durability_level)
            if result["status"] is False:
                self.log_failure("Create failed after collection recreate "
                                 "for key %s: %s" % (key, result["error"]))
            else:
                verification_dict["ops_create"] += 1
                if self.durability_level:
                    verification_dict["sync_write_committed_count"] += 1
                self.bucket.scopes[self.scope_name].collections[
                    self.collection_name].num_items += 1
        # NOTE(review): kept inside the `if`; SOURCE indentation was lost,
        # confirm whether this validation should also run for the default
        # collection
        validate_vb_detail_stats()

    # Release the acquired client
    self.sdk_client_pool.release_client(client)
    self.validate_test_failure()
def test_timeout_with_crud_failures(self): """ Test to make sure timeout is handled in durability calls and no documents are loaded when durability cannot be met using error simulation in server node side This will validate failure in majority of nodes, where durability will surely fail for all CRUDs 1. Select a node from the cluster to simulate the specified error 2. Perform CRUD on the target bucket with given timeout 3. Using cbstats to verify no operations succeeds 4. Revert the error scenario from the cluster to resume durability 5. Validate all mutations are succeeded after reverting the error condition Note: self.sdk_timeout values is considered as 'seconds' """ # Local methods to validate vb_seqno def compare_vb_stat(stat_1, stat_2, vb, comparison="!="): keys_to_check = ["high_seqno", "high_completed_seqno"] result = True for key in keys_to_check: if vb in stat_1.keys(): if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]: self.log_failure( "Mismatch in vb-%s UUID. %s != %s" % (vb, stat_1[vb]["uuid"], stat_2[vb]["uuid"])) if comparison == "!=": if stat_1[vb][key] != stat_2[vb][key]: result = False self.log.warning( "Mismatch in vb-%s stat %s. %s != %s" % (vb, key, stat_1[vb][key], stat_2[vb][key])) elif stat_1[vb][key] == stat_2[vb][key]: result = False self.log.warning( "Stat not updated for vb-%s stat %s. 
" "%s == %s" % (vb, key, stat_1[vb][key], stat_2[vb][key])) return result def validate_vb_seqno_stats(): """ :return retry_validation: Boolean denoting to retry validation """ retry_validation = False vb_info["post_timeout"][node.ip] = \ cbstat_obj[node.ip].vbucket_seqno(self.bucket.name) for tem_vb_num in range(self.cluster_util.vbuckets): tem_vb_num = str(tem_vb_num) if tem_vb_num not in affected_vbs: if compare_vb_stat(vb_info["init"][node.ip], vb_info["post_timeout"][node.ip], tem_vb_num) is False: self.log_failure("Unaffected vb-%s stat" % tem_vb_num) elif int(tem_vb_num) in target_nodes_vbuckets["active"]: if compare_vb_stat(vb_info["init"][node.ip], vb_info["post_timeout"][node.ip], tem_vb_num) is False: self.log.warning("%s - mismatch in %s vb-%s seq_no" % (node.ip, "active", tem_vb_num)) elif int(tem_vb_num) in target_nodes_vbuckets["replica"]: if compare_vb_stat(vb_info["init"][node.ip], vb_info["post_timeout"][node.ip], tem_vb_num, comparison="==") is False: retry_validation = True self.log.warning("%s - mismatch in %s vb-%s seq_no" % (node.ip, "replica", tem_vb_num)) return retry_validation shell_conn = dict() cbstat_obj = dict() error_sim = dict() target_nodes_vbuckets = dict() vb_info = dict() tasks = dict() doc_gen = dict() affected_vbs = list() target_nodes_vbuckets["active"] = [] target_nodes_vbuckets["replica"] = [] vb_info["init"] = dict() vb_info["post_timeout"] = dict() vb_info["afterCrud"] = dict() # Override crud_batch_size to minimum value for testing self.crud_batch_size = 5 self.key = "test_collections" self.sdk_timeout = 3 # Select target vbucket type to load_docs target_vb_type = "replica" if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \ and self.durability_level \ == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE: target_vb_type = "active" # Create required scope/collection for successful CRUD operation if self.scope_name != CbServer.default_scope: self.scope_name = self.bucket_util.get_random_name() 
self.collection_name = self.bucket_util.get_random_name() self.log.info("Creating scope::collection %s::%s" % (self.scope_name, self.collection_name)) self.create_scope_collection() # Load docs into created collection self.log.info("Loading data into created collection") load_gen = doc_generator(self.key, 0, self.num_items) task = self.task.async_load_gen_docs( self.cluster, self.bucket, load_gen, "create", 0, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool, batch_size=200, process_concurrency=8, timeout_secs=60) self.task_manager.get_task_result(task) if self.subdoc_test: load_gen = sub_doc_generator(self.key, 0, self.num_items / 2) task = self.task.async_load_gen_sub_docs( self.cluster, self.bucket, load_gen, Bucket_Op.SubDocOps.INSERT, timeout_secs=self.sdk_timeout, compression=self.sdk_compression, path_create=True, batch_size=100, process_concurrency=8, durability=self.durability_level, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool) self.task_manager.get_task_result(task) self.bucket.scopes[self.scope_name].collections[ self.collection_name].num_items = self.num_items target_nodes = DurabilityHelper.getTargetNodes(self.cluster, self.nodes_init, self.num_nodes_affected) for node in target_nodes: shell_conn[node.ip] = RemoteMachineShellConnection(node) cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip]) target_nodes_vbuckets["active"] += \ cbstat_obj[node.ip].vbucket_list(self.bucket.name, vbucket_type="active") target_nodes_vbuckets["replica"] += \ cbstat_obj[node.ip].vbucket_list(self.bucket.name, vbucket_type="replica") vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno( self.bucket.name) error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip]) curr_time = int(time.time()) expected_timeout = curr_time + self.sdk_timeout if target_vb_type == "active": target_vbs = list( set(target_nodes_vbuckets[target_vb_type]).difference( 
set(target_nodes_vbuckets["replica"]))) else: target_vbs = list( set(target_nodes_vbuckets[target_vb_type]).difference( set(target_nodes_vbuckets["active"]))) # Create required doc_generators doc_gen["create"] = doc_generator(self.key, self.num_items, self.crud_batch_size, target_vbucket=target_vbs) doc_gen["delete"] = doc_generator(self.key, 0, self.crud_batch_size, target_vbucket=target_vbs) doc_gen["read"] = doc_generator(self.key, int(self.num_items / 3), self.crud_batch_size, target_vbucket=target_vbs) doc_gen["update"] = doc_generator(self.key, int(self.num_items / 2), self.crud_batch_size, target_vbucket=target_vbs) # Create required subdoc generators doc_gen["insert"] = sub_doc_generator(self.key, int(self.num_items / 2), self.crud_batch_size, target_vbucket=target_vbs) doc_gen["upsert"] = sub_doc_generator_for_edit( self.key, 0, self.crud_batch_size, template_index=1, target_vbucket=target_vbs) doc_gen["remove"] = sub_doc_generator(self.key, 0, self.crud_batch_size, target_vbucket=target_vbs) # Perform specified action for node in target_nodes: error_sim[node.ip].create(self.simulate_error, bucket_name=self.bucket.name) self.sleep(5, "Wait for error_simulation to take effect") ops_to_perform = [ Bucket_Op.DocOps.CREATE, Bucket_Op.DocOps.UPDATE, Bucket_Op.DocOps.READ, Bucket_Op.DocOps.DELETE ] if self.subdoc_test: ops_to_perform = [ Bucket_Op.SubDocOps.INSERT, Bucket_Op.SubDocOps.UPSERT, Bucket_Op.SubDocOps.REMOVE ] for op_type in ops_to_perform: self.log.info("Starting doc op %s" % op_type) if op_type in Bucket_Op.DOC_OPS: tasks[op_type] = self.task.async_load_gen_docs( self.cluster, self.bucket, doc_gen[op_type], op_type, 0, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool, batch_size=1, process_concurrency=8, durability=self.durability_level, timeout_secs=self.sdk_timeout, suppress_error_table=True, print_ops_rate=False, skip_read_on_error=True) else: tasks[op_type] = self.task.async_load_gen_sub_docs( 
self.cluster, self.bucket, doc_gen[op_type], op_type, 0, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool, path_create=True, batch_size=1, process_concurrency=8, durability=self.durability_level, timeout_secs=self.sdk_timeout, print_ops_rate=False) self.task.jython_task_manager.get_task_result(tasks[op_type]) # Validate task failures if op_type == Bucket_Op.DocOps.READ: # Validation for read task if len(tasks[op_type].fail.keys()) != 0: self.log_failure("Read failed for few docs: %s" % tasks[op_type].fail.keys()) else: # Validation of CRUDs - Update / Create / Delete for doc_id, crud_result in tasks[op_type].fail.items(): vb_num = self.bucket_util.get_vbucket_num_for_key( doc_id, self.cluster_util.vbuckets) if SDKException.DurabilityAmbiguousException \ not in str(crud_result["error"]): self.log_failure( "Invalid exception for doc %s, vb %s: %s" % (doc_id, vb_num, crud_result)) # Revert the specified error scenario for node in target_nodes: error_sim[node.ip].revert(self.simulate_error, bucket_name=self.bucket.name) # Check whether the timeout triggered properly if int(time.time()) < expected_timeout: self.log_failure("Timed-out before expected time") for op_type in ops_to_perform: if op_type == Bucket_Op.DocOps.READ: continue while doc_gen[op_type].has_next(): doc_id, _ = doc_gen[op_type].next() affected_vbs.append( str( self.bucket_util.get_vbucket_num_for_key( doc_id, self.cluster_util.vbuckets))) affected_vbs = list(set(affected_vbs)) # Fetch latest stats and validate the seq_nos are not updated for node in target_nodes: retry_count = 0 max_retry = 3 while retry_count < max_retry: self.log.info("Trying to validate vbseq_no stats: %d" % (retry_count + 1)) retry_count += 1 retry_required = validate_vb_seqno_stats() if not retry_required: break self.sleep(5, "Sleep for vbseq_no stats to update") else: # This will be exited only if `break` condition is not met self.log_failure("validate_vb_seqno_stats verification 
failed") self.validate_test_failure() # Get SDK Client from client_pool sdk_client = self.sdk_client_pool.get_client_for_bucket( self.bucket, self.scope_name, self.collection_name) # Doc error validation for op_type in ops_to_perform: task = tasks[op_type] if self.nodes_init == 1 \ and op_type != Bucket_Op.DocOps.READ \ and len(task.fail.keys()) != (doc_gen[op_type].end - doc_gen[op_type].start): self.log_failure( "Failed keys %d are less than expected %d" % (len(task.fail.keys()), (doc_gen[op_type].end - doc_gen[op_type].start))) # Create table objects for display table_view = TableView(self.log.error) ambiguous_table_view = TableView(self.log.info) table_view.set_headers(["Key", "vBucket", "Exception"]) ambiguous_table_view.set_headers(["Key", "vBucket"]) # Iterate failed keys for validation for doc_key, doc_info in task.fail.items(): vb_for_key = self.bucket_util.get_vbucket_num_for_key(doc_key) if SDKException.DurabilityAmbiguousException \ not in str(doc_info["error"]): table_view.add_row( [doc_key, vb_for_key, doc_info["error"]]) ambiguous_table_view.add_row([doc_key, str(vb_for_key)]) if op_type not in Bucket_Op.SUB_DOC_OPS: retry_success = \ self.durability_helper.retry_for_ambiguous_exception( sdk_client, op_type, doc_key, doc_info) if not retry_success: self.log_failure("%s failed in retry for %s" % (op_type, doc_key)) # Display the tables (if any errors) table_view.display("Unexpected exception during %s" % op_type) ambiguous_table_view.display("D_Ambiguous exception during %s" % op_type) # Release the acquired client self.sdk_client_pool.release_client(sdk_client) # Verify doc count after expected CRUD failure self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.validate_docs_per_collections_all_buckets() # Fetch latest stats and validate the values are updated for node in target_nodes: vb_info["afterCrud"][node.ip] = \ cbstat_obj[node.ip].vbucket_seqno(self.bucket.name) if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]: 
self.log_failure("vBucket seq_no stats not updated") # Disconnect the shell connection for node in target_nodes: shell_conn[node.ip].disconnect() self.validate_test_failure()
def test_timeout_with_successful_crud(self):
    """
    Test to make sure durability CRUDs succeed when the simulated error
    on the target nodes is reverted while the doc load is in progress.

    For each doc operation (or sub-doc operation if self.subdoc_test):
    1. Start an async doc load which performs only that operation
    2. Simulate self.simulate_error on all the target nodes
    3. Wait for 10 seconds and revert the error scenario
    4. Wait for the load to complete and validate the load results
    5. Using cbstats, verify vbucket_seqno differs from the values
       recorded before the first operation

    After all operations, shell connections are closed and the doc
    count per collection is validated.

    Note: self.sdk_timeout is passed as MetaCrudParams.SDK_TIMEOUT
          (presumably treated as 'seconds' - confirm with the loader)
    """

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    init_seqno = dict()

    doc_crud = MetaCrudParams.DocCrud
    subdoc_crud = MetaCrudParams.SubDocCrud

    # Base spec with every mutation switched off.
    # Each iteration enables exactly one operation on a copy of this.
    doc_load_spec = dict()
    doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = self.sdk_timeout
    doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
    doc_load_spec["doc_crud"] = {
        doc_crud.COMMON_DOC_KEY: "test_collections",
        doc_crud.CREATE_PERCENTAGE_PER_COLLECTION: 0,
        doc_crud.UPDATE_PERCENTAGE_PER_COLLECTION: 0,
        doc_crud.DELETE_PERCENTAGE_PER_COLLECTION: 0,
    }
    doc_load_spec["subdoc_crud"] = {
        subdoc_crud.INSERT_PER_COLLECTION: 0,
        subdoc_crud.UPSERT_PER_COLLECTION: 0,
        subdoc_crud.REMOVE_PER_COLLECTION: 0,
    }

    # op_type -> (spec section, percentage key to enable for that op)
    # NOTE(review): "replace" has no key here (same as before), so that
    # iteration runs with the base spec unchanged - confirm if intended
    op_to_spec_key = {
        "create": ("doc_crud",
                   doc_crud.CREATE_PERCENTAGE_PER_COLLECTION),
        "update": ("doc_crud",
                   doc_crud.UPDATE_PERCENTAGE_PER_COLLECTION),
        "delete": ("doc_crud",
                   doc_crud.DELETE_PERCENTAGE_PER_COLLECTION),
        "read": ("doc_crud",
                 doc_crud.READ_PERCENTAGE_PER_COLLECTION),
        "insert": ("subdoc_crud", subdoc_crud.INSERT_PER_COLLECTION),
        "upsert": ("subdoc_crud", subdoc_crud.UPSERT_PER_COLLECTION),
        "remove": ("subdoc_crud", subdoc_crud.REMOVE_PER_COLLECTION),
    }

    ops_to_perform = ["create", "update", "read", "replace", "delete"]
    if self.subdoc_test:
        ops_to_perform = ["insert", "upsert", "remove"]

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            # Baseline seq_no values, recorded once before any CRUD
            init_seqno[node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])

        for op_type in ops_to_perform:
            self.log.info("Performing '%s' with timeout=%s"
                          % (op_type, self.sdk_timeout))
            curr_spec = deepcopy(doc_load_spec)
            if op_type in op_to_spec_key:
                section, percent_key = op_to_spec_key[op_type]
                curr_spec[section][percent_key] = 5
            if op_type == "read":
                # Reads are retried on timeout instead of failing
                curr_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
                    SDKException.TimeoutException
                ]

            doc_loading_task = \
                self.bucket_util.run_scenario_from_spec(
                    self.task,
                    self.cluster,
                    self.bucket_util.buckets,
                    curr_spec,
                    mutation_num=1,
                    async_load=True,
                    validate_task=False)

            # Perform specified action
            for node in target_nodes:
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)

            self.sleep(10, "Wait before reverting the error condition")

            # Revert the specified error scenario
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)

            self.task_manager.get_task_result(doc_loading_task)
            self.bucket_util.validate_doc_loading_results(doc_loading_task)
            if doc_loading_task.result is False:
                self.fail("Doc_loading for '%s' failed" % op_type)

            # Fetch latest stats and validate the values are updated
            # NOTE(review): compared against the pre-test baseline, which
            # is not refreshed between operations - confirm if intended
            for node in target_nodes:
                curr_stat = cbstat_obj[node.ip].vbucket_seqno(
                    self.bucket.name)
                if init_seqno[node.ip] == curr_stat:
                    self.log_failure("vbucket_seqno not updated. %s == %s"
                                     % (init_seqno[node.ip], curr_stat))
    finally:
        # Disconnect the shell connections even if the test aborted
        # (self.fail / loader exception), so that they are not leaked
        for shell in shell_conn.values():
            shell.disconnect()

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
    self.validate_test_failure()
def setUp(self):
    """
    Base setup for basic_ops tests.

    Reads the test params, rebalances-in the initial nodes, creates the
    default bucket with an RBAC user, creates a randomly named
    scope / collection when non-default names are requested, records
    num_items on the target collection and creates the durability
    helper and (if enabled) the SDK client pool.
    """
    super(basic_ops, self).setUp()

    self.doc_ops = self.input.param("doc_ops", "").split(";")
    self.observe_test = self.input.param("observe_test", False)

    # A non-default scope/collection param means: use a random name
    self.scope_name = self.input.param("scope", CbServer.default_scope)
    self.collection_name = self.input.param("collection",
                                            CbServer.default_collection)

    # Nodes to rebalance-in along with the master node
    if self.nodes_init != 1:
        nodes_to_add = self.cluster.servers[1:self.nodes_init]
    else:
        nodes_to_add = []
    self.task.rebalance([self.cluster.master], nodes_to_add, [])
    cluster_nodes = [self.cluster.master] + nodes_to_add
    self.cluster.nodes_in_cluster.extend(cluster_nodes)

    self.bucket_util.create_default_bucket(
        replica=self.num_replicas,
        compression_mode=self.compression_mode,
        bucket_type=self.bucket_type,
        storage=self.bucket_storage,
        eviction_policy=self.bucket_eviction_policy)
    self.bucket_util.add_rbac_user()

    target_bucket = self.bucket_util.buckets[0]

    # Replace the requested non-default names with random ones
    # and create the respective scope / collection
    if self.scope_name != CbServer.default_scope:
        self.scope_name = self.bucket_util.get_random_name()
        scope_spec = {"name": self.scope_name}
        self.bucket_util.create_scope(self.cluster.master,
                                      target_bucket,
                                      scope_spec)
    if self.collection_name != CbServer.default_collection:
        self.collection_name = self.bucket_util.get_random_name()
        collection_spec = {"name": self.collection_name,
                           "num_items": self.num_items}
        self.bucket_util.create_collection(self.cluster.master,
                                           target_bucket,
                                           self.scope_name,
                                           collection_spec)
        self.log.info("Using scope::collection - '%s::%s'"
                      % (self.scope_name, self.collection_name))

    # Track the expected num_items under the collection in use
    target_scope = target_bucket.scopes[self.scope_name]
    target_collection = target_scope.collections[self.collection_name]
    target_collection.num_items = self.num_items

    self.durability_helper = DurabilityHelper(
        self.log, len(self.cluster.nodes_in_cluster),
        durability=self.durability_level,
        replicate_to=self.replicate_to,
        persist_to=self.persist_to)

    # SDK clients are created only when a client pool is configured
    if self.sdk_client_pool:
        self.log.info("Creating SDK client pool")
        self.sdk_client_pool.create_clients(
            target_bucket,
            self.cluster.nodes_in_cluster,
            req_clients=self.sdk_pool_capacity,
            compression_settings=self.sdk_compression)

    # Reset active_resident_threshold to avoid further data load as DGM
    self.active_resident_threshold = 0

    self.cluster_util.print_cluster_stats()
    self.bucket_util.print_bucket_stats()
    self.log.info("==========Finished Basic_ops base setup========")
class basic_ops(BaseTestCase): def setUp(self): super(basic_ops, self).setUp() self.doc_ops = self.input.param("doc_ops", "").split(";") self.observe_test = self.input.param("observe_test", False) # Scope/collection name can be default or create a random one to test self.scope_name = self.input.param("scope", CbServer.default_scope) self.collection_name = self.input.param("collection", CbServer.default_collection) nodes_init = self.cluster.servers[1:self.nodes_init] \ if self.nodes_init != 1 else [] self.task.rebalance([self.cluster.master], nodes_init, []) self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init) self.bucket_util.create_default_bucket( replica=self.num_replicas, compression_mode=self.compression_mode, bucket_type=self.bucket_type, storage=self.bucket_storage, eviction_policy=self.bucket_eviction_policy) self.bucket_util.add_rbac_user() # Create Scope/Collection with random names if not equal to default if self.scope_name != CbServer.default_scope: self.scope_name = self.bucket_util.get_random_name() self.bucket_util.create_scope(self.cluster.master, self.bucket_util.buckets[0], {"name": self.scope_name}) if self.collection_name != CbServer.default_collection: self.collection_name = self.bucket_util.get_random_name() self.bucket_util.create_collection( self.cluster.master, self.bucket_util.buckets[0], self.scope_name, { "name": self.collection_name, "num_items": self.num_items }) self.log.info("Using scope::collection - '%s::%s'" % (self.scope_name, self.collection_name)) # Update required num_items under default collection self.bucket_util.buckets[0] \ .scopes[self.scope_name] \ .collections[self.collection_name] \ .num_items = self.num_items self.durability_helper = DurabilityHelper( self.log, len(self.cluster.nodes_in_cluster), durability=self.durability_level, replicate_to=self.replicate_to, persist_to=self.persist_to) # Create sdk_clients for pool if self.sdk_client_pool: self.log.info("Creating SDK client pool") 
self.sdk_client_pool.create_clients( self.bucket_util.buckets[0], self.cluster.nodes_in_cluster, req_clients=self.sdk_pool_capacity, compression_settings=self.sdk_compression) # Reset active_resident_threshold to avoid further data load as DGM self.active_resident_threshold = 0 self.cluster_util.print_cluster_stats() self.bucket_util.print_bucket_stats() self.log.info("==========Finished Basic_ops base setup========") def tearDown(self): super(basic_ops, self).tearDown() def do_basic_ops(self): KEY_NAME = 'key1' KEY_NAME2 = 'key2' self.log.info('Starting basic ops') default_bucket = self.bucket_util.get_all_buckets()[0] sdk_client = SDKClient([self.cluster.master], default_bucket, compression_settings=self.sdk_compression) # mcd = client.memcached(KEY_NAME) # MB-17231 - incr with full eviction rc = sdk_client.incr(KEY_NAME, delta=1) self.log.info('rc for incr: {0}'.format(rc)) # MB-17289 del with meta rc = sdk_client.set(KEY_NAME, 0, 0, json.dumps({'value': 'value2'})) self.log.info('set is: {0}'.format(rc)) # cas = rc[1] # wait for it to persist persisted = 0 while persisted == 0: opaque, rep_time, persist_time, persisted, cas = \ sdk_client.observe(KEY_NAME) try: rc = sdk_client.evict_key(KEY_NAME) except MemcachedError as exp: self.fail("Exception with evict meta - {0}".format(exp)) CAS = 0xabcd try: # key, exp, flags, seqno, cas rc = mcd.del_with_meta(KEY_NAME2, 0, 0, 2, CAS) except MemcachedError as exp: self.fail("Exception with del_with meta - {0}".format(exp)) # Reproduce test case for MB-28078 def do_setWithMeta_twice(self): mc = MemcachedClient(self.cluster.master.ip, constants.memcached_port) mc.sasl_auth_plain(self.cluster.master.rest_username, self.cluster.master.rest_password) mc.bucket_select('default') try: mc.setWithMeta('1', '{"Hello":"World"}', 3600, 0, 1, 0x1512a3186faa0000) except MemcachedError as error: self.log.info("<MemcachedError #%d ``%s''>" % (error.status, error.message)) self.fail("Error on First setWithMeta()") stats = mc.stats() 
self.log.info('curr_items: {0} and curr_temp_items:{1}'.format( stats['curr_items'], stats['curr_temp_items'])) self.sleep(5, "Wait before checking the stats") stats = mc.stats() self.log.info('curr_items: {0} and curr_temp_items:{1}'.format( stats['curr_items'], stats['curr_temp_items'])) try: mc.setWithMeta('1', '{"Hello":"World"}', 3600, 0, 1, 0x1512a3186faa0000) except MemcachedError as error: stats = mc.stats() self.log.info('After 2nd setWithMeta(), curr_items: {} ' 'and curr_temp_items: {}'.format( stats['curr_items'], stats['curr_temp_items'])) if int(stats['curr_temp_items']) == 1: self.fail("Error on second setWithMeta(), " "expected curr_temp_items to be 0") else: self.log.info("<MemcachedError #%d ``%s''>" % (error.status, error.message)) def generate_docs_bigdata(self, docs_per_day, start=0, document_size=1024000): return doc_generator(self.key, start, docs_per_day, key_size=self.key_size, doc_size=document_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value) def test_doc_size(self): def check_durability_failures(): self.log.error(task.sdk_acked_curd_failed.keys()) self.log.error(task.sdk_exception_crud_succeed.keys()) self.assertTrue( len(task.sdk_acked_curd_failed) == 0, "Durability failed for docs: %s" % task.sdk_acked_curd_failed.keys()) self.assertTrue( len(task.sdk_exception_crud_succeed) == 0, "Durability failed for docs: %s" % task.sdk_acked_curd_failed.keys()) """ Basic tests for document CRUD operations using JSON docs """ doc_op = self.input.param("doc_op", None) def_bucket = self.bucket_util.buckets[0] ignore_exceptions = list() retry_exceptions = list() supported_d_levels = self.bucket_util.get_supported_durability_levels() # Stat validation reference variables verification_dict = dict() verification_dict["ops_create"] = 0 verification_dict["ops_update"] = 0 verification_dict["ops_delete"] = 0 
verification_dict["rollback_item_count"] = 0 verification_dict["sync_write_aborted_count"] = 0 verification_dict["sync_write_committed_count"] = 0 if self.target_vbucket and type(self.target_vbucket) is not list: self.target_vbucket = [self.target_vbucket] self.log.info("Creating doc_generator..") # Load basic docs into bucket doc_create = doc_generator(self.key, 0, self.num_items, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value) self.log.info("Loading {0} docs into the bucket: {1}".format( self.num_items, def_bucket)) task = self.task.async_load_gen_docs( self.cluster, def_bucket, doc_create, "create", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, ryow=self.ryow, check_persistence=self.check_persistence, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) if self.ryow: check_durability_failures() # Retry doc_exception code self.log.info("Validating failed doc's (if any) exceptions") doc_op_info_dict = dict() doc_op_info_dict[task] = self.bucket_util.get_doc_op_info_dict( def_bucket, "create", exp=0, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout=self.sdk_timeout, time_unit="seconds", ignore_exceptions=ignore_exceptions, retry_exceptions=retry_exceptions) self.bucket_util.verify_doc_op_task_exceptions(doc_op_info_dict, self.cluster, self.sdk_client_pool) if len(doc_op_info_dict[task]["unwanted"]["fail"].keys()) != 0: self.fail("Failures in retry doc CRUDs: {0}".format( doc_op_info_dict[task]["unwanted"]["fail"])) self.log.info("Wait for 
ep_all_items_remaining to become '0'") self.bucket_util._wait_for_stats_all_buckets() # Update ref_val verification_dict["ops_create"] += \ self.num_items - len(task.fail.keys()) # Validate vbucket stats if self.durability_level in supported_d_levels: verification_dict["sync_write_committed_count"] += self.num_items failed = self.durability_helper.verify_vbucket_details_stats( def_bucket, self.cluster_util.get_kv_nodes(), vbuckets=self.cluster_util.vbuckets, expected_val=verification_dict) if failed: self.fail("Cbstat vbucket-details verification failed") # Verify initial doc load count self.log.info("Validating doc_count in buckets") self.bucket_util.validate_doc_count_as_per_collections(def_bucket) self.log.info("Creating doc_generator for doc_op") num_item_start_for_crud = int(self.num_items / 2) doc_update = doc_generator(self.key, 0, num_item_start_for_crud, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, vbuckets=self.cluster_util.vbuckets, mutate=1, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value) if self.target_vbucket: mutation_doc_count = len(doc_update.doc_keys) else: mutation_doc_count = (doc_update.end - doc_update.start + len(task.fail.keys())) if doc_op == "update": self.log.info("Performing 'update' mutation over the docs") task = self.task.async_load_gen_docs( self.cluster, def_bucket, doc_update, "update", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, ryow=self.ryow, check_persistence=self.check_persistence, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) verification_dict["ops_update"] += mutation_doc_count if self.durability_level in supported_d_levels: 
verification_dict["sync_write_committed_count"] \ += mutation_doc_count if self.ryow: check_durability_failures() # Read all the values to validate update operation task = self.task.async_validate_docs( self.cluster, def_bucket, doc_update, "update", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) elif doc_op == "delete": self.log.info("Performing 'delete' mutation over the docs") task = self.task.async_load_gen_docs( self.cluster, def_bucket, doc_update, "delete", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, ryow=self.ryow, check_persistence=self.check_persistence, scope=self.scope_name, collection=self.collection_name, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) if self.collection_name is None: target_scope = CbServer.default_scope target_collection = CbServer.default_collection else: target_scope = self.scope_name target_collection = self.collection_name def_bucket \ .scopes[target_scope] \ .collections[target_collection] \ .num_items -= (self.num_items - num_item_start_for_crud) verification_dict["ops_delete"] += mutation_doc_count if self.durability_level in supported_d_levels: verification_dict["sync_write_committed_count"] \ += mutation_doc_count if self.ryow: check_durability_failures() # Read all the values to validate delete operation task = self.task.async_validate_docs( self.cluster, def_bucket, doc_update, "delete", 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) else: self.log.warning("Unsupported doc_operation") 
self.log.info("Wait for ep_all_items_remaining to become '0'") self.bucket_util._wait_for_stats_all_buckets() failed = self.durability_helper.verify_vbucket_details_stats( def_bucket, self.cluster_util.get_kv_nodes(), vbuckets=self.cluster_util.vbuckets, expected_val=verification_dict) if failed: self.fail("Cbstat vbucket-details verification failed") self.log.info("Validating doc_count") self.bucket_util.validate_doc_count_as_per_collections(def_bucket) def test_large_doc_size(self): # bucket size=256MB, when Bucket gets filled 236MB then # test starts failing document size=2MB, No of docs = 221, # load 250 docs generate docs with size >= 1MB , See MB-29333 self.doc_size *= 1024 * 1024 gens_load = self.generate_docs_bigdata(docs_per_day=self.num_items, document_size=self.doc_size) for bucket in self.bucket_util.buckets: task = self.task.async_load_gen_docs( self.cluster, bucket, gens_load, "create", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) # check if all the documents(250) are loaded with default timeout self.bucket_util.verify_stats_all_buckets(self.num_items) def test_large_doc_20MB(self): # test reproducer for MB-29258, # Load a doc which is greater than 20MB # with compression enabled and check if it fails # check with compression_mode as active, passive and off val_error = SDKException.ValueTooLargeException gens_load = self.generate_docs_bigdata(docs_per_day=1, document_size=(self.doc_size * 1024000)) for bucket in self.bucket_util.buckets: task = self.task.async_load_gen_docs( self.cluster, bucket, gens_load, "create", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, 
timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) if self.doc_size > 20: if len(task.fail.keys()) == 0: self.log_failure("No failures during large doc insert") for doc_id, doc_result in task.fail.items(): if val_error not in str(doc_result["error"]): self.log_failure("Invalid exception for key %s: %s" % (doc_id, doc_result)) else: if len(task.fail.keys()) != 0: self.log_failure("Failures during large doc insert") for bucket in self.bucket_util.buckets: if self.doc_size > 20: # failed with error "Data Too Big" when document size > 20MB self.bucket_util.verify_stats_all_buckets(0) else: self.bucket_util.verify_stats_all_buckets(1) gens_update = self.generate_docs_bigdata( docs_per_day=1, document_size=(21 * 1024000)) task = self.task.async_load_gen_docs( self.cluster, bucket, gens_update, "update", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) if len(task.fail.keys()) != 1: self.log_failure("Large docs inserted for keys: %s" % task.fail.keys()) if len(task.fail.keys()) == 0: self.log_failure("No failures during large doc insert") for key, crud_result in task.fail.items(): if SDKException.ValueTooLargeException \ not in str(crud_result["error"]): self.log_failure("Unexpected error for key %s: %s" % (key, crud_result["error"])) for doc_id, doc_result in task.fail.items(): if val_error not in str(doc_result["error"]): self.log_failure("Invalid exception for key %s: %s" % (doc_id, doc_result)) self.bucket_util.verify_stats_all_buckets(1) self.validate_test_failure() def test_parallel_cruds(self): data_op_dict = dict() num_items = self.num_items half_of_num_items = self.num_items / 2 supported_d_levels = self.bucket_util.get_supported_durability_levels() 
exp_values_to_test = [0, 900, 4000, 12999] # Initial doc_loading initial_load = doc_generator(self.key, 0, self.num_items, doc_size=self.doc_size) task = self.task.async_load_gen_docs( self.cluster, self.bucket_util.buckets[0], initial_load, DocLoading.Bucket.DocOps.CREATE, 0, batch_size=100, process_concurrency=8, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) # Create required doc_gens and doc_op task object for doc_op in self.doc_ops: if doc_op == DocLoading.Bucket.DocOps.CREATE: num_items += half_of_num_items gen_start = self.num_items gen_end = self.num_items + half_of_num_items elif doc_op == DocLoading.Bucket.DocOps.DELETE: gen_start = 0 gen_end = half_of_num_items else: gen_start = half_of_num_items gen_end = self.num_items d_level = "" replicate_to = persist_to = 0 if self.num_replicas > 0: replicate_to = randint(1, self.num_replicas) persist_to = randint(0, self.num_replicas + 1) if not self.observe_test and choice([True, False]): d_level = choice(supported_d_levels) self.log.info("Doc_op %s, range (%d, %d), " "replicate_to=%s, persist_to=%s, d_level=%s" % (doc_op, gen_start, gen_end, replicate_to, persist_to, d_level)) data_op_dict[doc_op] = dict() data_op_dict[doc_op]["doc_gen"] = doc_generator( self.key, gen_start, gen_end, doc_size=self.doc_size, mutation_type=doc_op) data_op_dict[doc_op]["task"] = self.task.async_load_gen_docs( self.cluster, self.bucket_util.buckets[0], data_op_dict[doc_op]["doc_gen"], doc_op, exp=choice(exp_values_to_test), compression=self.sdk_compression, persist_to=persist_to, replicate_to=replicate_to, durability=d_level, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool, process_concurrency=1, batch_size=1, print_ops_rate=False, start_task=False) # Start all tasks for doc_op in self.doc_ops: self.task_manager.add_new_task(data_op_dict[doc_op]["task"]) # Wait for doc_ops to complete and validate final doc 
value result for doc_op in self.doc_ops: self.task_manager.get_task_result(data_op_dict[doc_op]["task"]) self.log.info("%s task completed" % doc_op) if data_op_dict[doc_op]["task"].fail: self.log_failure("Doc_loading failed for %s: %s" % (doc_op, data_op_dict[doc_op]["task"].fail)) elif doc_op in [ DocLoading.Bucket.DocOps.CREATE, DocLoading.Bucket.DocOps.UPDATE, DocLoading.Bucket.DocOps.REPLACE, DocLoading.Bucket.DocOps.DELETE ]: suppress_err_tbl = False if doc_op == DocLoading.Bucket.DocOps.DELETE: suppress_err_tbl = True self.log.info("Validating %s results" % doc_op) # Read all the values to validate doc_operation values task = self.task.async_validate_docs( self.cluster, self.bucket_util.buckets[0], data_op_dict[doc_op]["doc_gen"], doc_op, 0, batch_size=self.batch_size, process_concurrency=self.process_concurrency, sdk_client_pool=self.sdk_client_pool, suppress_error_table=suppress_err_tbl) self.task.jython_task_manager.get_task_result(task) self.validate_test_failure() def test_diag_eval_curl(self): # Check if diag/eval can be done only by local host self.disable_diag_eval_on_non_local_host = \ self.input.param("disable_diag_eval_non_local", False) port = self.cluster.master.port # check if local host can work fine cmd = [] cmd_base = 'curl http://{0}:{1}@localhost:{2}/diag/eval ' \ .format(self.cluster.master.rest_username, self.cluster.master.rest_password, port) command = cmd_base + '-X POST -d \'os:cmd("env")\'' cmd.append(command) command = cmd_base + '-X POST -d \'case file:read_file("/etc/passwd") of {ok, B} -> io:format("~p~n", [binary_to_term(B)]) end.\'' cmd.append(command) shell = RemoteMachineShellConnection(self.cluster.master) for command in cmd: output, error = shell.execute_command(command) self.assertNotEquals("API is accessible from localhost only", output[0]) # Disable allow_nonlocal_eval if not self.disable_diag_eval_on_non_local_host: command = cmd_base + '-X POST -d \'ns_config:set(allow_nonlocal_eval, true).\'' _, _ = 
shell.execute_command(command) # Check ip address on diag/eval will not work fine # when allow_nonlocal_eval is disabled cmd = [] cmd_base = 'curl http://{0}:{1}@{2}:{3}/diag/eval ' \ .format(self.cluster.master.rest_username, self.cluster.master.rest_password, self.cluster.master.ip, port) command = cmd_base + '-X POST -d \'os:cmd("env")\'' cmd.append(command) command = cmd_base + '-X POST -d \'case file:read_file("/etc/passwd") of {ok, B} -> io:format("~p~n", [binary_to_term(B)]) end.\'' cmd.append(command) for command in cmd: output, error = shell.execute_command(command) if self.disable_diag_eval_on_non_local_host: self.assertEquals("API is accessible from localhost only", output[0]) else: self.assertNotEquals("API is accessible from localhost only", output[0]) def test_MB_40967(self): """ 1. Load initial docs into the bucket 2. Perform continuous reads until get_cmd stats breaks in 'cbstats timings' command """ total_gets = 0 max_gets = 2500000000 bucket = self.bucket_util.buckets[0] doc_gen = doc_generator(self.key, 0, self.num_items, doc_size=1) create_task = self.task.async_load_gen_docs( self.cluster, bucket, doc_gen, "create", 0, batch_size=100, process_concurrency=self.process_concurrency, timeout_secs=self.sdk_timeout) self.task_manager.get_task_result(create_task) cbstat = dict() kv_nodes = self.cluster_util.get_kv_nodes() for node in kv_nodes: shell = RemoteMachineShellConnection(node) cbstat[node] = Cbstats(shell) self.log.info("Start doc_reads until total_gets cross: %s" % max_gets) read_task = self.task.async_continuous_doc_ops( self.cluster, bucket, doc_gen, op_type="read", batch_size=self.batch_size, process_concurrency=self.process_concurrency, timeout_secs=self.sdk_timeout) self.sleep(60, "Wait for read task to start") while total_gets < max_gets: total_gets = 0 for node in kv_nodes: output, error = cbstat[node].get_timings(bucket.name) if error: self.log_failure("Error during cbstat timings: %s" % error) break get_cmd_found = False for line in 
output: if "get_cmd_" in line: if "get_cmd_mean" in line: break get_cmd_found = True if not get_cmd_found: self.log.error(output) self.log_failure("cbstat timings get_cmd stats not found") break vb_details = cbstat[node].vbucket_details(bucket.name) for _, vb_stats in vb_details.items(): total_gets += long(vb_stats["ops_get"]) if self.test_failure: break self.sleep( 120, "Total_gets: %s, itr: %s" % (total_gets, read_task.itr_count)) read_task.end_task() self.task_manager.get_task_result(read_task) # Close all shell connections for node in kv_nodes: cbstat[node].shellConn.disconnect() self.validate_test_failure() def test_MB_41510(self): """ 1. Load initial docs into the bucket 2. Perform continuous reads 3. Perform 'mcstat reset' in parallel to the reads 4. Perform 'cbstats timings' command to read the current values 5. Validate there is no crash when stats are getting reset continuously """ def reset_mcstat(bucket_name): mc_stat = dict() for t_node in kv_nodes: shell_conn = RemoteMachineShellConnection(t_node) mc_stat[t_node] = McStat(shell_conn) while not stop_thread: for t_node in mc_stat.keys(): try: mc_stat[t_node].reset(bucket_name) except Exception as mcstat_err: self.log_failure(mcstat_err) if self.test_failure: break for t_node in mc_stat.keys(): mc_stat[t_node].shellConn.disconnect() def get_timings(bucket_name): cb_stat = dict() for t_node in kv_nodes: shell_conn = RemoteMachineShellConnection(t_node) cb_stat[t_node] = Cbstats(shell_conn) while not stop_thread: for t_node in cb_stat.keys(): try: cb_stat[t_node].get_timings(bucket_name) except Exception as cbstat_err: self.log_failure(cbstat_err) if self.test_failure: break for t_node in cb_stat.keys(): cb_stat[t_node].shellConn.disconnect() total_gets = 0 max_gets = 50000000 stop_thread = False bucket = self.bucket_util.buckets[0] cb_stat_obj = dict() kv_nodes = self.cluster_util.get_kv_nodes() for node in self.cluster_util.get_kv_nodes(): shell = RemoteMachineShellConnection(node) cb_stat_obj[node] = 
Cbstats(shell) doc_gen = doc_generator(self.key, 0, self.num_items, doc_size=1) create_task = self.task.async_load_gen_docs( self.cluster, bucket, doc_gen, "create", 0, batch_size=500, process_concurrency=self.process_concurrency, timeout_secs=self.sdk_timeout) self.task_manager.get_task_result(create_task) mc_stat_reset_thread = Thread(target=reset_mcstat, args=[bucket.name]) get_timings_thread = Thread(target=get_timings, args=[bucket.name]) mc_stat_reset_thread.start() get_timings_thread.start() read_task = self.task.async_continuous_doc_ops( self.cluster, bucket, doc_gen, op_type="read", batch_size=self.batch_size, process_concurrency=self.process_concurrency, timeout_secs=self.sdk_timeout) while total_gets < max_gets: total_gets = 0 try: for node in cb_stat_obj.keys(): vb_details = cb_stat_obj[node].vbucket_details(bucket.name) for _, vb_stats in vb_details.items(): total_gets += long(vb_stats["ops_get"]) except Exception as err: self.log_failure(err) self.log.info("Total gets: %s" % total_gets) result, core_msg, stream_msg = self.check_coredump_exist( self.servers, force_collect=True) if result is not False: self.log_failure(core_msg + stream_msg) break elif self.test_failure: break self.sleep(60, "Wait before next check") stop_thread = True read_task.end_task() mc_stat_reset_thread.join() get_timings_thread.join() # Close all shell connections for node in cb_stat_obj.keys(): cb_stat_obj[node].shellConn.disconnect() self.validate_test_failure() def verify_stat(self, items, value="active"): mc = MemcachedClient(self.cluster.master.ip, constants.memcached_port) mc.sasl_auth_plain(self.cluster.master.rest_username, self.cluster.master.rest_password) mc.bucket_select('default') stats = mc.stats() self.assertEquals(stats['ep_compression_mode'], value) self.assertEquals(int(stats['ep_item_compressor_num_compressed']), items) self.assertNotEquals(int(stats['vb_active_itm_memory']), int(stats['vb_active_itm_memory_uncompressed'])) def 
test_compression_active_and_off(self): """ test reproducer for MB-29272, Load some documents with compression mode set to active get the cbstats change compression mode to off and wait for minimum 250ms Load some more documents and check the compression is not done epengine.basic_ops.basic_ops.test_compression_active_and_off,items=10000,compression_mode=active :return: """ # Load some documents with compression mode as active gen_create = doc_generator("eviction1_", start=0, end=self.num_items, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, vbuckets=self.cluster_util.vbuckets, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value) gen_create2 = doc_generator("eviction2_", start=0, end=self.num_items, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, vbuckets=self.cluster_util.vbuckets, randomize_doc_size=self.randomize_doc_size, randomize_value=self.randomize_value) def_bucket = self.bucket_util.get_all_buckets()[0] task = self.task.async_load_gen_docs( self.cluster, def_bucket, gen_create, "create", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.verify_stats_all_buckets(self.num_items) remote = RemoteMachineShellConnection(self.cluster.master) for bucket in self.bucket_util.buckets: # change compression mode to off output, _ = remote.execute_couchbase_cli( cli_command='bucket-edit', cluster_host="localhost:8091", user=self.cluster.master.rest_username, password=self.cluster.master.rest_password, options='--bucket=%s --compression-mode off' % bucket.name) self.assertTrue(' '.join(output).find('SUCCESS') != -1, 'compression mode set to off') # sleep for 10 sec (minimum 250sec) self.sleep(10) 
# Load data and check stats to see compression # is not done for newly added data task = self.task.async_load_gen_docs( self.cluster, def_bucket, gen_create2, "create", 0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, compression=self.sdk_compression, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.verify_stats_all_buckets(self.num_items * 2) def MB36948(self): node_to_stop = self.servers[0] self.log.info("Adding index/query node") self.task.rebalance([self.cluster.master], [self.servers[2]], [], services=["n1ql,index"]) self.log.info("Creating SDK client connection") client = SDKClient([self.cluster.master], self.bucket_util.buckets[0], compression_settings=self.sdk_compression) self.log.info("Stopping memcached on: %s" % node_to_stop) ssh_conn = RemoteMachineShellConnection(node_to_stop) err_sim = CouchbaseError(self.log, ssh_conn) err_sim.create(CouchbaseError.STOP_MEMCACHED) result = client.crud("create", "abort1", "abort1_val") if not result["status"]: self.log_failure("Async SET failed") result = client.crud("update", "abort1", "abort1_val", durability=self.durability_level, timeout=3, time_unit="seconds") if result["status"]: self.log_failure("Sync write succeeded") if SDKException.DurabilityAmbiguousException not in result["error"]: self.log_failure("Invalid exception for sync_write: %s" % result) self.log.info("Resuming memcached on: %s" % node_to_stop) err_sim.revert(CouchbaseError.STOP_MEMCACHED) self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.verify_stats_all_buckets(1) self.log.info("Closing ssh & SDK connections") ssh_conn.disconnect() client.close() self.validate_test_failure() def do_get_random_key(self): # MB-31548, get_Random key gets hung sometimes. 
mc = MemcachedClient(self.cluster.master.ip, constants.memcached_port) mc.sasl_auth_plain(self.cluster.master.rest_username, self.cluster.master.rest_password) mc.bucket_select('default') count = 0 while count < 1000000: count += 1 try: mc.get_random_key() except MemcachedError as error: self.fail("<MemcachedError #%d ``%s''>" % (error.status, error.message)) if count % 1000 == 0: self.log.info('The number of iteration is {}'.format(count))
def test_bulk_sync_write_in_progress(self):
    """
    Validate 'sync_write_in_progress' failures for bulk doc operations

    1. Select the target nodes and induce 'self.simulate_error' on them
    2. Start an async, spec driven doc loading (doc_ops[0]) targeting
       the vbuckets affected by the error simulation
    3. For every scope / collection / op_type present in the loader spec,
       run a second doc load (doc_ops[1]) using the doc generator stored
       in the spec and validate the failures seen are
       AmbiguousTimeoutException with KV_SYNC_WRITE_IN_PROGRESS
       as the retry reason
    4. Revert the error condition, wait for the first load to complete
       and validate the doc loading results / doc counts

    Test params:
    doc_ops - Two operations separated by ';'. The first one is run
              through the spec loader, the second one is run in parallel
    """
    doc_ops = self.input.param("doc_ops").split(';')
    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    # Created once, outside the node loop, so that the seqno info of
    # every target node is retained (not only the last node's)
    vb_info["init"] = dict()
    active_vbs = dict()
    replica_vbs = dict()
    # Nodes on which the error simulation was attempted
    induced_nodes = list()
    sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            # Fetch the active / replica vb_nums of the affected node
            active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="active")
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        target_vbs = replica_vbs
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            # Target all the active vbuckets of the target nodes
            target_vbs = active_vbs
            target_vbuckets = list()
            for target_node in target_nodes:
                target_vbuckets += target_vbs[target_node.ip]
        else:
            # Target only the replica vbuckets common to all target nodes
            target_vbuckets = target_vbs[target_nodes[0].ip]
            for target_node in target_nodes[1:]:
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_node.ip])))

        doc_load_spec = dict()
        doc_load_spec["doc_crud"] = dict()
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = \
            self.durability_level
        doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
        doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"

        if doc_ops[0] == "create":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "update":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "replace":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "delete":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

        # Induce error condition for testing
        for node in target_nodes:
            # Recorded before create() so that a partially applied
            # error condition is also reverted in the 'finally' block
            induced_nodes.append(node)
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                async_load=True)

        self.sleep(5, "Wait for doc ops to reach server")

        # This will support both sync-write and non-sync-writes
        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = "NONE"

        for _, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        doc_loader_task_2 = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"], doc_ops[1], 0,
                            scope=s_name, collection=c_name,
                            sdk_client_pool=self.sdk_client_pool,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            replicate_to=self.replicate_to,
                            persist_to=self.persist_to,
                            durability=tem_durability, timeout_secs=3,
                            print_ops_rate=False,
                            skip_read_on_error=True,
                            task_identifier="parallel_task2")
                        self.task.jython_task_manager.get_task_result(
                            doc_loader_task_2)

                        # Validation to verify the sync_in_write_errors
                        # in doc_loader_task_2
                        failed_docs = doc_loader_task_2.fail
                        if len(failed_docs.keys()) != 1:
                            self.log_failure(
                                "Exception not seen for docs: %s"
                                % failed_docs)

                        valid_exception = self.durability_helper \
                            .validate_durability_exception(
                                failed_docs,
                                SDKException.AmbiguousTimeoutException,
                                retry_reason=sync_write_in_progress)
                        if not valid_exception:
                            self.log_failure("Got invalid exception")
    finally:
        try:
            # Revert the introduced error condition. Done in 'finally'
            # so the cluster is not left in the error state on failures
            for node in induced_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
        finally:
            # The shells are used only by the error simulation objects,
            # so they can be closed right after the revert
            for shell in shell_conn.values():
                shell.disconnect()

    # Wait for doc_loading to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops[0] == "update":
        for _, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"], "read",
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        for key, doc_info in read_task.success.items():
                            # cas == 0 entries are skipped. Others are
                            # expected to carry 'mutated' == 1
                            if doc_info["cas"] == 0:
                                continue
                            doc_val = json.loads(str(doc_info["value"]))
                            if doc_val["mutated"] != 1:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Validate doc_count per collection
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
def test_sub_doc_sync_write_in_progress(self):
    """
    Test to simulate sync_write_in_progress error and validate the behavior
    This will validate failure in majority of nodes, where durability will
    surely fail for all CRUDs

    1. Select nodes to simulate the error which will affect the durability
    2. Enable the specified error_scenario on the selected nodes
    3. Perform individual CRUDs and verify sync_write_in_progress errors
    4. Validate the end results

    Test params:
    doc_ops - Operation run through the spec loader (default: "insert")
    """

    doc_ops = self.input.param("doc_ops", "insert")

    shell_conn = dict()
    cbstat_obj = dict()
    error_sim = dict()
    vb_info = dict()
    active_vbs = dict()
    replica_vbs = dict()
    # Created once, outside the node loop, so that the seqno info of
    # every target node is retained (not only the last node's)
    vb_info["init"] = dict()
    doc_load_spec = dict()
    # Nodes on which the error simulation was attempted
    induced_nodes = list()
    # SDK client acquired from the pool (released in 'finally')
    client = None

    amb_timeout = SDKException.AmbiguousTimeoutException
    kv_sync_write_in_progress = \
        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
    doc_not_found_exception = SDKException.DocumentNotFoundException

    # Override d_level, error_simulation type based on d_level
    self.__get_d_level_and_error_to_simulate()

    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)
    try:
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            # Fetch the active / replica vb_nums of the affected node
            active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="active")
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        target_vbs = replica_vbs
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            # Target all the active vbuckets of the target nodes
            target_vbs = active_vbs
            target_vbuckets = list()
            for target_node in target_nodes:
                target_vbuckets += target_vbs[target_node.ip]
        else:
            # Target only the replica vbuckets common to all target nodes
            target_vbuckets = target_vbs[target_nodes[0].ip]
            for target_node in target_nodes[1:]:
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_node.ip])))

        self.load_data_for_sub_doc_ops()

        doc_load_spec["doc_crud"] = dict()
        doc_load_spec["subdoc_crud"] = dict()
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = \
            self.durability_level
        doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
        doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

        # Acquire SDK client from the pool for performing doc_ops locally
        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        # Override the crud_batch_size
        self.crud_batch_size = 5

        # Update mutation spec based on the required doc_operation
        if doc_ops == DocLoading.Bucket.DocOps.CREATE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.DocOps.UPDATE:
            # Equality check (not 'in'): a substring match against the
            # op name would also accept values like "up" or ""
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.DocOps.DELETE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.INSERT:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.UPSERT:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.REMOVE:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 1

        # This is to support both sync-write and non-sync-writes
        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = Bucket.DurabilityLevel.NONE

        # Perform specified action
        for node in target_nodes:
            # Recorded before create() so that a partially applied
            # error condition is also reverted in the 'finally' block
            induced_nodes.append(node)
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error simulation to take effect")

        # Initialize tasks and store the task objects
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                mutation_num=2,
                batch_size=1,
                async_load=True)

        # Start the doc_loader_task
        self.sleep(10, "Wait for task_1 CRUDs to reach server")

        sub_doc_ops = [DocLoading.Bucket.SubDocOps.INSERT,
                       DocLoading.Bucket.SubDocOps.UPSERT,
                       DocLoading.Bucket.SubDocOps.REMOVE]
        for _, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        # Only the first key of the generator is used
                        key, _ = c_meta[op_type]["doc_gen"].next()
                        expected_exception = amb_timeout
                        retry_reason = kv_sync_write_in_progress
                        if doc_ops == "create":
                            expected_exception = doc_not_found_exception
                            retry_reason = None

                        for sub_doc_op in sub_doc_ops:
                            val = ["my_mutation", "val"]
                            if sub_doc_op \
                                    == DocLoading.Bucket.SubDocOps.REMOVE:
                                val = "mutated"
                            result = client.crud(
                                sub_doc_op, key, val,
                                durability=tem_durability,
                                timeout=2)

                            if result[0]:
                                self.log_failure(
                                    "Doc crud succeeded for %s" % op_type)
                                continue

                            # Failure details are read only when the
                            # crud did not report success
                            crud_err = result[1][key]["error"]
                            if expected_exception not in str(crud_err):
                                self.log_failure(
                                    "Invalid exception for key %s: %s"
                                    % (key, crud_err))
                            elif retry_reason is not None and \
                                    retry_reason not in str(crud_err):
                                self.log_failure(
                                    "Retry reason missing for key %s: %s"
                                    % (key, crud_err))
    finally:
        try:
            # Revert the introduced error condition. Done in 'finally'
            # so the cluster is not left in the error state on failures
            for node in induced_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
        finally:
            # Release the acquired SDK client (not used beyond this point)
            if client is not None:
                self.sdk_client_pool.release_client(client)
            # The shells are used only by the error simulation objects,
            # so they can be closed right after the revert
            for shell in shell_conn.values():
                shell.disconnect()

    # Wait for doc_loader_task_1 to complete
    self.task.jython_task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed")

    # Validate docs for update success or not
    if doc_ops == DocLoading.Bucket.DocOps.UPDATE:
        for _, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        # Rewind the generator before reading the docs
                        c_meta[op_type]["doc_gen"].reset()
                        read_task = self.task.async_load_gen_docs(
                            self.cluster, self.bucket,
                            c_meta[op_type]["doc_gen"],
                            DocLoading.Bucket.DocOps.READ,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            timeout_secs=self.sdk_timeout)
                        self.task_manager.get_task_result(read_task)
                        for key, doc_info in read_task.success.items():
                            # cas == 0 entries are skipped. Others are
                            # expected to carry 'mutated' == 2
                            # (spec was run with mutation_num=2)
                            if doc_info["cas"] == 0:
                                continue
                            doc_val = json.loads(str(doc_info["value"]))
                            if doc_val["mutated"] != 2:
                                self.log_failure(
                                    "Update failed for key %s: %s"
                                    % (key, doc_info))

    # Verify initial doc load count
    self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                 self.cluster.buckets)
    self.bucket_util.validate_docs_per_collections_all_buckets(
        self.cluster)
    self.validate_test_failure()
def test_sub_doc_with_persistence_issues(self):
    """
    1. Select nodes from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations met the durability condition

    The test is skipped (returns without any validation) when the
    durability level is MAJORITY_AND_PERSIST_TO_ACTIVE or
    PERSIST_TO_MAJORITY.
    """

    # 'or ""' guards against durability_level not being set (None)
    if (self.durability_level or "").upper() in [
            Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
            Bucket.DurabilityLevel.PERSIST_TO_MAJORITY]:
        self.log.critical("Test not valid for persistence durability")
        return

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    # Nodes on which the error simulation was attempted
    induced_nodes = list()
    def_bucket = self.bucket_util.buckets[0]

    load_spec = dict()
    load_spec["doc_crud"] = dict()
    load_spec["subdoc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 50
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 20
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 10
    load_spec["subdoc_crud"][
        MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 10

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    # Create new docs for sub-doc operations to run
    self.load_data_for_sub_doc_ops()

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        for node in target_nodes:
            # Create shell_connections
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            # Snapshot of the stats before the CRUDs
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

        try:
            for node in target_nodes:
                # Perform specified action
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])
                # Recorded before create() so that a partially applied
                # error condition is also reverted in 'finally'
                induced_nodes.append(node)
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=def_bucket.name)

            # Perform CRUDs with induced error scenario is active
            self.log.info("Perform 'insert', 'upsert', 'remove' mutations")
            doc_loading_task = \
                self.bucket_util.run_scenario_from_spec(
                    self.task,
                    self.cluster,
                    self.bucket_util.buckets,
                    load_spec,
                    mutation_num=0,
                    async_load=True)

            # Perform new scope/collection creation during doc ops
            # in parallel
            self.__perform_collection_crud(mutation_num=1)

            # Wait for doc_loading to complete and validate the doc ops
            self.task_manager.get_task_result(doc_loading_task)
            if doc_loading_task.result is False:
                self.log_failure("Doc CRUDs failed with persistence issue")
        finally:
            # Revert the induced error condition. Done in 'finally'
            # so the cluster is not left in the error state on failures
            for node in induced_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=def_bucket.name)

        # Fetch latest failover / seqno stats and compare with the
        # values recorded before the CRUDs
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

            # Failover validation: stats are expected to be unchanged,
            # so the assertion fails only when they got updated
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="Failover stats got updated")

            # Seq_no validation (High level): seqno must have changed
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect the shell connection. Done in 'finally' so that
        # the ssh sessions are closed even when an assertion fails
        for shell in shell_conn.values():
            shell.disconnect()

    self.validate_test_failure()
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
class CrashTest(BaseTestCase): def setUp(self): super(CrashTest, self).setUp() self.doc_ops = self.input.param("doc_ops", None) self.process_name = self.input.param("process", None) self.service_name = self.input.param("service", "data") self.sig_type = self.input.param("sig_type", "SIGKILL").upper() self.target_node = self.input.param("target_node", "active") self.pre_warmup_stats = {} self.timeout = 120 self.new_docs_to_add = 10000 if self.doc_ops is not None: self.doc_ops = self.doc_ops.split(";") nodes_init = self.cluster.servers[1:self.nodes_init] \ if self.nodes_init != 1 else [] self.task.rebalance([self.cluster.master], nodes_init, []) self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_init) if not self.atomicity: self.durability_helper = DurabilityHelper( self.log, self.nodes_init, durability=self.durability_level, replicate_to=self.replicate_to, persist_to=self.persist_to) self.bucket_util.create_default_bucket( bucket_type=self.bucket_type, ram_quota=self.bucket_size, replica=self.num_replicas, compression_mode="off", storage=self.bucket_storage, eviction_policy=self.bucket_eviction_policy) self.bucket_util.add_rbac_user() if self.sdk_client_pool: self.log.info("Creating SDK clients for client_pool") for bucket in self.bucket_util.buckets: self.sdk_client_pool.create_clients( bucket, [self.cluster.master], self.sdk_pool_capacity, compression_settings=self.sdk_compression) verification_dict = dict() verification_dict["ops_create"] = self.num_items verification_dict["sync_write_aborted_count"] = 0 verification_dict["rollback_item_count"] = 0 verification_dict["pending_writes"] = 0 if self.durability_level: verification_dict["sync_write_committed_count"] = self.num_items # Load initial documents into the buckets self.log.info("Loading initial documents") gen_create = doc_generator(self.key, 0, self.num_items, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=self.target_vbucket, 
vbuckets=self.cluster_util.vbuckets) if self.atomicity: task = self.task.async_load_gen_docs_atomicity( self.cluster, self.bucket_util.buckets, gen_create, "create", exp=0, batch_size=10, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, update_count=self.update_count, transaction_timeout=self.transaction_timeout, commit=True, sync=self.sync) self.task.jython_task_manager.get_task_result(task) else: for bucket in self.bucket_util.buckets: task = self.task.async_load_gen_docs( self.cluster, bucket, gen_create, DocLoading.Bucket.DocOps.CREATE, self.maxttl, persist_to=self.persist_to, replicate_to=self.replicate_to, durability=self.durability_level, batch_size=10, process_concurrency=8, sdk_client_pool=self.sdk_client_pool) self.task.jython_task_manager.get_task_result(task) self.bucket_util._wait_for_stats_all_buckets() # Verify cbstats vbucket-details stats_failed = \ self.durability_helper.verify_vbucket_details_stats( bucket, self.cluster_util.get_kv_nodes(), vbuckets=self.cluster_util.vbuckets, expected_val=verification_dict) if stats_failed: self.fail("Cbstats verification failed") self.bucket_util.verify_stats_all_buckets(self.num_items) self.cluster_util.print_cluster_stats() self.bucket_util.print_bucket_stats() self.log.info("==========Finished CrashTest setup========") def tearDown(self): super(CrashTest, self).tearDown() def getTargetNode(self): if len(self.cluster.nodes_in_cluster) > 1: return self.cluster.nodes_in_cluster[randint( 0, self.nodes_init - 1)] return self.cluster.master def getVbucketNumbers(self, shell_conn, bucket_name, replica_type): cb_stats = Cbstats(shell_conn) return cb_stats.vbucket_list(bucket_name, replica_type) def test_stop_process(self): """ 1. Starting loading docs into the default bucket 2. Stop the requested process, which will impact the memcached operations 3. Wait for load bucket task to complete 4. 
Validate the docs for durability """ error_to_simulate = self.input.param("simulate_error", None) def_bucket = self.bucket_util.buckets[0] target_node = self.getTargetNode() remote = RemoteMachineShellConnection(target_node) error_sim = CouchbaseError(self.log, remote) target_vbuckets = self.getVbucketNumbers(remote, def_bucket.name, self.target_node) if len(target_vbuckets) == 0: self.log.error("No target vbucket list generated to load data") remote.disconnect() return # Create doc_generator targeting only the active/replica vbuckets # present in the target_node gen_load = doc_generator(self.key, self.num_items, self.new_docs_to_add, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=target_vbuckets, vbuckets=self.cluster_util.vbuckets) if self.atomicity: task = self.task.async_load_gen_docs_atomicity( self.cluster, self.bucket_util.buckets, gen_load, "create", exp=0, batch_size=10, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, update_count=self.update_count, transaction_timeout=self.transaction_timeout, commit=True, sync=self.sync) else: task = self.task.async_load_gen_docs( self.cluster, def_bucket, gen_load, "create", exp=0, batch_size=1, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, skip_read_on_error=True) # Induce the error condition error_sim.create(error_to_simulate) self.sleep(20, "Wait before reverting the error condition") # Revert the simulated error condition and close the ssh session error_sim.revert(error_to_simulate) remote.disconnect() # Wait for doc loading task to complete self.task.jython_task_manager.get_task_result(task) if not self.atomicity: if len(task.fail.keys()) != 0: if self.target_node == "active" or self.num_replicas in [2, 3]: self.log_failure("Unwanted failures for keys: 
%s" % task.fail.keys()) validate_passed = \ self.durability_helper.validate_durability_exception( task.fail, SDKException.DurabilityAmbiguousException) if not validate_passed: self.log_failure("Unwanted exception seen during validation") # Create SDK connection for CRUD retries sdk_client = SDKClient([self.cluster.master], def_bucket) for doc_key, crud_result in task.fail.items(): result = sdk_client.crud("create", doc_key, crud_result["value"], replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout=self.sdk_timeout) if result["status"] is False: self.log_failure("Retry of doc_key %s failed: %s" % (doc_key, result["error"])) # Close the SDK connection sdk_client.close() # Update self.num_items self.num_items += self.new_docs_to_add if not self.atomicity: # Validate doc count self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.verify_stats_all_buckets(self.num_items) self.validate_test_failure() def test_crash_process(self): """ 1. Starting loading docs into the default bucket 2. Crash the requested process, which will not impact the memcached operations 3. Wait for load bucket task to complete 4. Validate the docs for durability """ def_bucket = self.bucket_util.buckets[0] target_node = self.getTargetNode() remote = RemoteMachineShellConnection(target_node) target_vbuckets = range(0, self.cluster_util.vbuckets) retry_exceptions = list() # If Memcached is killed, we should not perform KV ops on # particular node. If not we can target all nodes for KV operation. 
if self.process_name == "memcached": target_vbuckets = self.getVbucketNumbers(remote, def_bucket.name, self.target_node) if self.target_node == "active": retry_exceptions = [SDKException.TimeoutException] if len(target_vbuckets) == 0: self.log.error("No target vbucket list generated to load data") remote.disconnect() return # Create doc_generator targeting only the active/replica vbuckets # present in the target_node gen_load = doc_generator(self.key, self.num_items, self.new_docs_to_add, key_size=self.key_size, doc_size=self.doc_size, doc_type=self.doc_type, target_vbucket=target_vbuckets, vbuckets=self.cluster_util.vbuckets) if self.atomicity: task = self.task.async_load_gen_docs_atomicity( self.cluster, self.bucket_util.buckets, gen_load, "create", exp=0, batch_size=10, process_concurrency=self.process_concurrency, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, update_count=self.update_count, transaction_timeout=self.transaction_timeout, commit=True, sync=self.sync) else: task = self.task.async_load_gen_docs( self.cluster, def_bucket, gen_load, "create", exp=0, batch_size=10, process_concurrency=8, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, skip_read_on_error=True) task_info = dict() task_info[task] = self.bucket_util.get_doc_op_info_dict( def_bucket, "create", 0, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout=self.sdk_timeout, time_unit="seconds", retry_exceptions=retry_exceptions) self.sleep(10, "Wait for doc_ops to start") self.log.info("Killing {0}:{1} on node {2}".format( self.process_name, self.service_name, target_node.ip)) remote.kill_process(self.process_name, self.service_name, signum=signum[self.sig_type]) remote.disconnect() # Wait for tasks completion and validate failures if self.atomicity: 
self.task.jython_task_manager.get_task_result(task) if not self.atomicity: self.bucket_util.verify_doc_op_task_exceptions( task_info, self.cluster) self.bucket_util.log_doc_ops_task_failures(task_info) # Update self.num_items self.num_items += self.new_docs_to_add # Verification stats verification_dict = dict() verification_dict["ops_create"] = self.num_items verification_dict["sync_write_aborted_count"] = 0 verification_dict["rollback_item_count"] = 0 verification_dict["pending_writes"] = 0 if self.durability_level: verification_dict["sync_write_committed_count"] = self.num_items if self.bucket_type == Bucket.Type.EPHEMERAL \ and self.process_name == "memcached": self.sleep(10, "Wait for memcached to recover from the crash") result = self.task.rebalance(self.servers[:self.nodes_init], [], []) self.assertTrue(result, "Rebalance failed") # Validate doc count if not self.atomicity: self.bucket_util._wait_for_stats_all_buckets() self.bucket_util.verify_stats_all_buckets(self.num_items) if self.process_name != "memcached": stats_failed = \ self.durability_helper.verify_vbucket_details_stats( def_bucket, self.cluster_util.get_kv_nodes(), vbuckets=self.cluster_util.vbuckets, expected_val=verification_dict) if stats_failed: self.fail("Cbstats verification failed") def test_process_error_on_nodes(self): """ Test to validate OoO returns feature 1. Start parallel CRUDs using single client 2. Perform process crash / stop with doc_ops in parallel 3. 
Make sure no crash or ep_eng issue is seen with the err_simulation """ tasks = list() node_data = dict() bucket = self.bucket_util.buckets[0] revert_errors = [ CouchbaseError.STOP_MEMCACHED, CouchbaseError.STOP_SERVER, CouchbaseError.STOP_BEAMSMP, CouchbaseError.STOP_PERSISTENCE ] # Overriding sdk_timeout to max self.sdk_timeout = 60 # Disable auto-failover to avoid failover of nodes status = RestConnection(self.cluster.master) \ .update_autofailover_settings(False, 120, False) self.assertTrue(status, msg="Failure during disabling auto-failover") # Can take 'all_nodes' / 'single node' crash_on = self.input.param("crash_on", "single_node") error_to_simulate = self.input.param("simulate_error", CouchbaseError.KILL_MEMCACHED) num_times_to_affect = self.input.param("times_to_affect", 20) nodes_to_affect = self.cluster_util.get_kv_nodes() if crash_on == "single_node": nodes_to_affect = [choice(nodes_to_affect)] create_gen = doc_generator(self.key, self.num_items, self.num_items * 2) update_gen = doc_generator(self.key, 0, self.num_items / 2) delete_gen = doc_generator(self.key, self.num_items / 2, self.num_items) for node in nodes_to_affect: shell = RemoteMachineShellConnection(node) node_data[node] = dict() node_data[node]["cb_err"] = CouchbaseError(self.log, shell) self.log.info("Starting doc-ops") for doc_op in self.doc_ops: load_gen = update_gen if doc_op == DocLoading.Bucket.DocOps.CREATE: load_gen = create_gen elif doc_op == DocLoading.Bucket.DocOps.DELETE: load_gen = delete_gen task = self.task.async_load_gen_docs( self.cluster, bucket, load_gen, doc_op, replicate_to=self.replicate_to, persist_to=self.persist_to, durability=self.durability_level, timeout_secs=self.sdk_timeout, sdk_client_pool=self.sdk_client_pool, batch_size=10, process_concurrency=1, skip_read_on_error=True, print_ops_rate=False) tasks.append(task) self.log.info("Starting error_simulation on %s" % nodes_to_affect) for itr in range(1, num_times_to_affect + 1): self.log.info("Iteration :: %d" % 
itr) for node in nodes_to_affect: node_data[node]["cb_err"].create(error_to_simulate, bucket.name) if error_to_simulate in revert_errors: self.sleep(30, "Sleep before reverting the error") for node in nodes_to_affect: node_data[node]["cb_err"].revert(error_to_simulate, bucket.name) else: self.sleep(10, "Wait for process to come back online") # Wait for doc_ops to complete for task in tasks: self.task_manager.get_task_result(task)
def test_sub_doc_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Start sub-doc CRUDs (read / insert / upsert / remove) on the
       buckets in async mode
    3. Induce self.simulate_error on the selected node and run
       scope/collection CRUDs in parallel to the doc loading
    4. Wait for the doc loading to complete and revert the error
    5. Using cbstats, validate the failover stats and vbucket seqnos
    6. Validate the doc count of all collections

    The test fails right away when num_replicas is less than 2.
    SSH connections opened by the test are closed even when one of the
    validations fails.

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    if self.num_replicas < 2:
        self.fail("Required: num_replicas > 1")

    # Override num_of_nodes affected to 1
    self.num_nodes_affected = 1

    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()
    def_bucket = self.bucket_util.buckets[0]

    self.load_data_for_sub_doc_ops()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    try:
        for node in target_nodes:
            # Create shell_connections
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            active_vbs = cbstat_obj[node.ip].vbucket_list(
                def_bucket.name, "active")
            active_vbs_in_target_nodes += active_vbs
            # Snapshot of seqno / failover stats taken before the CRUDs
            vb_info_info["init"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

        # Remove active vbuckets from doc_loading to avoid errors
        load_spec = dict()
        # load_spec["target_vbuckets"] = list(set(target_vbuckets)
        #                                ^ set(active_vbs_in_target_nodes))
        load_spec["doc_crud"] = dict()
        load_spec["subdoc_crud"] = dict()
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 10
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 50
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 25
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 25

        self.log.info("Perform 'create', 'update', 'delete' mutations")

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        self.sleep(5, "Wait for doc loaders to start loading data")

        for node in target_nodes:
            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log,
                                                shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud(mutation_num=2)

        # Wait for document_loader tasks to complete
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Sub_doc CRUDs failed with process crash")

        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=def_bucket.name)

        # Fetch latest failover stats and validate the values are updated
        self.log.info("Validating failover and seqno cbstats")
        for node in target_nodes:
            vb_info_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
            failover_info["afterCrud"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(def_bucket.name)

            # Failover validation
            # NOTE(review): the check passes only when the failover stats
            # are unchanged, while the message reads as the opposite.
            # Confirm which of the two reflects the intent.
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats not updated after error condition"
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level)
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val,
                            msg="vbucket seq_no not updated after CRUDs")
    finally:
        # Disconnect the shell connections, including the case where a
        # step above raised, so no SSH session is left open by the test
        for shell in shell_conn.values():
            shell.disconnect()

    self.validate_test_failure()
    # Doc count validation
    self.bucket_util._wait_for_stats_all_buckets()
    self.bucket_util.validate_docs_per_collections_all_buckets()
class CrashTest(CollectionBase):
    """
    Tests which stop / kill a process on a cluster node while doc loading
    or scope/collection operations are in progress, and then validate the
    outcome of those operations.
    """

    def setUp(self):
        """
        Read the test params, load self.num_items docs into every bucket
        and validate the vbucket-details stats of the loaded data.
        """
        super(CrashTest, self).setUp()

        self.doc_ops = self.input.param("doc_ops", None)
        self.process_name = self.input.param("process", None)
        self.service_name = self.input.param("service", "data")
        self.sig_type = self.input.param("sig_type", "SIGKILL").upper()
        self.target_node = self.input.param("target_node", "active")
        self.client_type = self.input.param("client_type", "sdk").lower()
        self.N1qltxn = self.input.param("N1qltxn", False)

        self.pre_warmup_stats = dict()
        self.timeout = 120
        self.new_docs_to_add = 10000

        # doc_ops is given as a ';' separated string
        if self.doc_ops is not None:
            self.doc_ops = self.doc_ops.split(";")

        if not self.atomicity:
            self.durability_helper = DurabilityHelper(
                self.log, self.nodes_init,
                durability=self.durability_level,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to)

        # Expected cbstats values, seeded from the item count already
        # tracked for the default collection of the first bucket
        verification_dict = dict()
        verification_dict["ops_create"] = \
            self.cluster.buckets[0].scopes[
                CbServer.default_scope].collections[
                CbServer.default_collection].num_items
        verification_dict["sync_write_aborted_count"] = 0
        verification_dict["rollback_item_count"] = 0
        verification_dict["pending_writes"] = 0
        if self.durability_level:
            verification_dict["sync_write_committed_count"] = \
                verification_dict["ops_create"]

        # Load initial documents into the buckets
        transaction_gen_create = doc_generator(
            "transaction_key", 0, self.num_items,
            key_size=self.key_size,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        gen_create = doc_generator(
            self.key, 0, self.num_items,
            key_size=self.key_size,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=self.target_vbucket,
            vbuckets=self.cluster_util.vbuckets)
        if self.atomicity:
            transaction_task = self.task.async_load_gen_docs_atomicity(
                self.cluster, self.cluster.buckets,
                transaction_gen_create, DocLoading.Bucket.DocOps.CREATE,
                exp=0,
                batch_size=10,
                process_concurrency=self.process_concurrency,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                update_count=self.update_count,
                transaction_timeout=self.transaction_timeout,
                commit=True,
                sync=self.sync)
            self.task.jython_task_manager.get_task_result(transaction_task)
        for bucket in self.cluster.buckets:
            task = self.task.async_load_gen_docs(
                self.cluster, bucket, gen_create,
                DocLoading.Bucket.DocOps.CREATE, self.maxttl,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                batch_size=10, process_concurrency=8)
            self.task.jython_task_manager.get_task_result(task)
            self.bucket_util._wait_for_stats_all_buckets(
                self.cluster.buckets)

            # Account the newly loaded docs in the tracked item count
            self.cluster.buckets[0].scopes[
                CbServer.default_scope].collections[
                CbServer.default_collection].num_items += self.num_items
            verification_dict["ops_create"] += self.num_items
            if self.durability_level:
                verification_dict["sync_write_committed_count"] += \
                    self.num_items
            # Verify cbstats vbucket-details
            stats_failed = \
                self.durability_helper.verify_vbucket_details_stats(
                    bucket, self.cluster_util.get_kv_nodes(),
                    vbuckets=self.cluster_util.vbuckets,
                    expected_val=verification_dict)

            if self.atomicity is False:
                if stats_failed:
                    self.fail("Cbstats verification failed")
                self.bucket_util.verify_stats_all_buckets(
                    self.cluster,
                    self.cluster.buckets[0].scopes[
                        CbServer.default_scope].collections[
                        CbServer.default_collection].num_items)
        self.bucket = self.cluster.buckets[0]
        if self.N1qltxn:
            self.n1ql_server = \
                self.cluster_util.get_nodes_from_services_map(
                    service_type="n1ql",
                    get_all_nodes=True)
            self.n1ql_helper = N1QLHelper(server=self.n1ql_server,
                                          use_rest=True,
                                          buckets=self.cluster.buckets,
                                          log=self.log,
                                          scan_consistency='REQUEST_PLUS',
                                          num_collection=3,
                                          num_buckets=1,
                                          num_savepoints=1,
                                          override_savepoint=False,
                                          num_stmt=10,
                                          load_spec=self.data_spec_name)
            self.bucket_col = self.n1ql_helper.get_collections()
            self.stmts = self.n1ql_helper.get_stmt(self.bucket_col)
            self.stmts = self.n1ql_helper.create_full_stmts(self.stmts)
        self.log.info("==========Finished CrashTest setup========")

    def tearDown(self):
        """Delegate the cleanup to the base class."""
        super(CrashTest, self).tearDown()

    def getTargetNode(self):
        """
        Return the node on which the error will be simulated.

        :return: A random node picked from the first 'nodes_init' entries
                 of cluster.nodes_in_cluster when the cluster has more
                 than one node, else the master node
        """
        if len(self.cluster.nodes_in_cluster) > 1:
            return self.cluster.nodes_in_cluster[randint(0, self.nodes_init-1)]
        return self.cluster.master

    def start_doc_loading_tasks(self, target_vbuckets,
                                scope_name, collection_obj):
        """
        Start the async create tasks used by the stop / crash tests.

        Always sets self.doc_loading_task. Additionally sets
        self.transaction_load_task when atomicity is enabled, else
        self.N1ql_load_task when N1qltxn is enabled.

        :param target_vbuckets: vbucket numbers passed to doc_generator
                                as 'target_vbucket'
        :param scope_name: Scope name used by the doc loading task
        :param collection_obj: Collection object to load into. Its
                               num_items is incremented for the new docs
        """
        # Create doc_generator targeting only the active/replica vbuckets
        # present in the target_node
        transaction_gen_load = doc_generator(
            "transaction_key", self.num_items, self.new_docs_to_add,
            key_size=self.key_size,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=target_vbuckets,
            vbuckets=self.cluster_util.vbuckets)
        gen_load = doc_generator(
            self.key, self.num_items, self.new_docs_to_add,
            key_size=self.key_size,
            doc_size=self.doc_size,
            doc_type=self.doc_type,
            target_vbucket=target_vbuckets,
            vbuckets=self.cluster_util.vbuckets)
        if self.atomicity:
            self.transaction_load_task = \
                self.task.async_load_gen_docs_atomicity(
                    self.cluster, self.cluster.buckets,
                    transaction_gen_load, DocLoading.Bucket.DocOps.CREATE,
                    exp=0,
                    batch_size=10,
                    process_concurrency=self.process_concurrency,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    update_count=self.update_count,
                    transaction_timeout=self.transaction_timeout,
                    commit=True,
                    sync=self.sync)
            # NOTE(review): num_items is incremented here and once more
            # after the regular load below - confirm this is intended
            collection_obj.num_items += self.new_docs_to_add
        elif self.N1qltxn:
            self.N1ql_load_task = self.task.async_n1qlTxn_query(
                self.stmts,
                n1ql_helper=self.n1ql_helper,
                commit=True,
                scan_consistency="REQUEST_PLUS")
        self.doc_loading_task = self.task.async_load_gen_docs(
            self.cluster, self.bucket, gen_load,
            DocLoading.Bucket.DocOps.CREATE,
            exp=0,
            batch_size=10,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            scope=scope_name,
            collection=collection_obj.name,
            skip_read_on_error=True)
        collection_obj.num_items += self.new_docs_to_add

    @staticmethod
    def getVbucketNumbers(shell_conn, bucket_name, replica_type):
        """
        Fetch the vbucket list of the given type using cbstats.

        :param shell_conn: Shell connection used to create the Cbstats obj
        :param bucket_name: Name of the bucket to query
        :param replica_type: vbucket type passed on to vbucket_list
        :return: Value returned by Cbstats.vbucket_list
        """
        cb_stats = Cbstats(shell_conn)
        return cb_stats.vbucket_list(bucket_name, replica_type)

    def test_create_remove_scope_with_node_crash(self):
        """
        1. Select a error scenario to simulate in random
        2. Create error scenario either before or after scope create/delete
        3. Initiate scope creation/deletion under the bucket
        4. Validate the outcome of scope creation/deletion
        """
        def create_scope(client_type, bucket_obj, scope):
            # Create the scope through the SDK client or the REST API
            if client_type == "sdk":
                client.create_scope(scope)
                self.bucket_util.create_scope_object(bucket_obj,
                                                     {"name": scope})
            elif client_type == "rest":
                self.bucket_util.create_scope(self.cluster.master,
                                              bucket_obj,
                                              {"name": scope})
            else:
                self.log_failure("Invalid client_type provided")

        def remove_scope(client_type, bucket_obj, scope):
            # Drop the scope through the SDK client or the REST API
            if client_type == "sdk":
                client.drop_scope(scope)
                self.bucket_util.mark_scope_as_dropped(bucket_obj, scope)
            elif client_type == "rest":
                self.bucket_util.drop_scope(self.cluster.master,
                                            bucket_obj,
                                            scope)
            else:
                self.log_failure("Invalid client_type provided")

        kv_nodes = self.cluster_util.get_kv_nodes()
        # A non-master KV node is required. '< 2' also covers an empty
        # list, which would otherwise break the sample() call below
        if len(kv_nodes) < 2:
            self.fail("Need atleast two KV nodes to run this test")

        client = None
        task = None
        action = self.input.param("action", "create")
        crash_during = self.input.param("crash_during", "pre_action")
        data_load_option = self.input.param("data_load_option", None)
        crash_type = self.input.param("simulate_error",
                                      CouchbaseError.KILL_MEMCACHED)

        # Always use a random scope name to create/remove
        # since CREATE/DROP not supported for default scope
        self.scope_name = \
            BucketUtils.get_random_name(
                max_length=CbServer.max_scope_name_len)

        # Select a KV node other than master node from the cluster
        node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        use_client = sample(["sdk", "rest"], 1)[0]

        if action == "remove":
            # Create a scope to be removed
            create_scope(use_client, self.bucket, self.scope_name)

        # Create a error scenario
        shell = RemoteMachineShellConnection(node_to_crash)
        cb_error = CouchbaseError(self.log, shell)
        cbstat_obj = Cbstats(shell)
        active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        # Load only into vbuckets which are not active on node_to_crash
        target_vbuckets = list(
            set(range(0, 1024)).difference(set(active_vbs)))
        doc_gen = doc_generator(self.key, 0, 1000,
                                target_vbucket=target_vbuckets)

        if crash_during == "pre_action":
            cb_error.create(crash_type)

        if data_load_option == "mutate_default_collection":
            task = self.task.async_load_gen_docs(
                self.cluster, self.bucket, doc_gen,
                DocLoading.Bucket.DocOps.UPDATE,
                exp=self.maxttl,
                batch_size=200, process_concurrency=4,
                compression=self.sdk_compression,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                sdk_client_pool=self.sdk_client_pool)

        if action == "create":
            create_scope(self.client_type, self.bucket, self.scope_name)
        elif action == "remove":
            remove_scope(self.client_type, self.bucket, self.scope_name)

        if crash_during == "post_action":
            cb_error.create(crash_type)

        self.sleep(60, "Wait before reverting the error scenario")
        cb_error.revert(crash_type)

        if data_load_option == "mutate_default_collection":
            self.task_manager.get_task_result(task)

        # Close SSH and SDK connections
        shell.disconnect()
        # Hand the SDK client back to the pool it was taken from
        self.sdk_client_pool.release_client(client)
        if self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)
        self.validate_test_failure()

    def test_create_remove_collection_with_node_crash(self):
        """
        1. Select a error scenario to simulate in random
        2. Create error scenario either before or after collection action
        3. Initiate collection creation/deletion under the bucket
        4. Validate the outcome of collection creation/deletion
        """
        def create_collection(client_type, bucket_obj, scope, collection):
            # Create the collection through the SDK client or the REST API
            if client_type == "sdk":
                client.create_collection(collection, scope)
                self.bucket_util.create_collection_object(
                    bucket_obj, scope, {"name": collection})
            elif client_type == "rest":
                self.bucket_util.create_collection(self.cluster.master,
                                                   bucket_obj,
                                                   scope,
                                                   {"name": collection})
            else:
                self.log_failure("Invalid client_type provided")

        def remove_collection(client_type, bucket_obj, scope, collection):
            # Drop the collection through the SDK client or the REST API
            if client_type == "sdk":
                client.drop_collection(scope, collection)
                self.bucket_util.mark_collection_as_dropped(bucket_obj,
                                                            scope,
                                                            collection)
            elif client_type == "rest":
                self.bucket_util.drop_collection(self.cluster.master,
                                                 bucket_obj,
                                                 scope,
                                                 collection)
            else:
                self.log_failure("Invalid client_type provided")

        kv_nodes = self.cluster_util.get_kv_nodes()
        # A non-master KV node is required. '< 2' also covers an empty
        # list, which would otherwise break the sample() call below
        if len(kv_nodes) < 2:
            self.fail("Need atleast two KV nodes to run this test")

        client = None
        task = None
        action = self.input.param("action", "create")
        crash_during = self.input.param("crash_during", "pre_action")
        data_load_option = self.input.param("data_load_option", None)
        crash_type = self.input.param("simulate_error",
                                      CouchbaseError.KILL_MEMCACHED)

        # Non-default scope / collection names are replaced by random
        # names. The random scope is created right away
        if self.scope_name != CbServer.default_scope:
            self.scope_name = \
                BucketUtils.get_random_name(
                    max_length=CbServer.max_scope_name_len)
            self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                          {"name": self.scope_name})
        if self.collection_name != CbServer.default_collection:
            self.collection_name = \
                BucketUtils.get_random_name(
                    max_length=CbServer.max_collection_name_len)

        # Select a KV node other than master node from the cluster
        node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        use_client = sample(["sdk", "rest"], 1)[0]

        if action == "remove" \
                and self.collection_name != CbServer.default_collection:
            # Create a collection to be removed
            create_collection(use_client, self.bucket,
                              self.scope_name, self.collection_name)

        # Create a error scenario
        self.log.info("Selected scenario for test '%s'" % crash_type)
        shell = RemoteMachineShellConnection(node_to_crash)
        cb_error = CouchbaseError(self.log, shell)
        cbstat_obj = Cbstats(shell)
        active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                             vbucket_type="active")
        # Load only into vbuckets which are not active on node_to_crash
        target_vbuckets = list(
            set(range(0, 1024)).difference(set(active_vbs)))
        doc_gen = doc_generator(self.key, 0, 1000,
                                target_vbucket=target_vbuckets)

        if crash_during == "pre_action":
            cb_error.create(crash_type)

        if data_load_option == "mutate_default_collection":
            task = self.task.async_load_gen_docs(
                self.cluster, self.bucket, doc_gen,
                DocLoading.Bucket.DocOps.UPDATE,
                exp=self.maxttl,
                batch_size=200, process_concurrency=8,
                compression=self.sdk_compression,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)

        if action == "create":
            create_collection(self.client_type, self.bucket,
                              self.scope_name, self.collection_name)
        elif action == "remove":
            remove_collection(self.client_type, self.bucket,
                              self.scope_name, self.collection_name)

        if crash_during == "post_action":
            cb_error.create(crash_type)

        if data_load_option == "mutate_default_collection":
            self.task_manager.get_task_result(task)

        self.sleep(60, "Wait before reverting the error scenario")
        cb_error.revert(crash_type)

        # Close SSH and SDK connections
        shell.disconnect()
        # Hand the SDK client back to the pool it was taken from
        self.sdk_client_pool.release_client(client)
        if self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)
        self.validate_test_failure()

    def test_stop_process(self):
        """
        1. Starting loading docs into the default bucket
        2. Stop the requested process, which will impact the
           memcached operations
        3. Wait for load bucket task to complete
        4. Validate the docs for durability
        """
        error_to_simulate = self.input.param("simulate_error", None)
        target_node = self.getTargetNode()
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        target_vbuckets = CrashTest.getVbucketNumbers(
            remote, self.bucket.name, self.target_node)

        # Pick one random collection to load the new docs into
        bucket_dict = BucketUtils.get_random_collections(
            self.cluster.buckets,
            req_num=1,
            consider_scopes="all",
            consider_buckets="all")

        bucket = BucketUtils.get_bucket_obj(self.cluster.buckets,
                                            bucket_dict.keys()[0])
        scope_name = bucket_dict[bucket.name]["scopes"].keys()[0]
        collection_name = bucket_dict[bucket.name][
            "scopes"][scope_name]["collections"].keys()[0]
        scope = BucketUtils.get_scope_obj(
            bucket, scope_name)
        collection = BucketUtils.get_collection_obj(scope, collection_name)

        if len(target_vbuckets) == 0:
            self.log.error("No target vbucket list generated to load data")
            remote.disconnect()
            return

        self.start_doc_loading_tasks(target_vbuckets, scope_name,
                                     collection)

        # Induce the error condition
        error_sim.create(error_to_simulate)

        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition and close the ssh session
        error_sim.revert(error_to_simulate)
        remote.disconnect()

        # Wait for doc loading task to complete
        self.task.jython_task_manager.get_task_result(self.doc_loading_task)
        if self.atomicity:
            self.task.jython_task_manager.get_task_result(
                self.transaction_load_task)
        elif self.N1qltxn:
            self.task.jython_task_manager.get_task_result(
                self.N1ql_load_task)

        if len(self.doc_loading_task.fail.keys()) != 0:
            if self.target_node == "active" or self.num_replicas in [2, 3]:
                self.log_failure("Unwanted failures for keys: %s"
                                 % self.doc_loading_task.fail.keys())

        validate_passed = \
            self.durability_helper.validate_durability_exception(
                self.doc_loading_task.fail,
                SDKException.DurabilityAmbiguousException)
        if not validate_passed:
            self.log_failure("Unwanted exception seen during validation")

        # Get SDK client for CRUD retries
        sdk_client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        for doc_key, crud_result in self.doc_loading_task.fail.items():
            result = sdk_client.crud(DocLoading.Bucket.DocOps.CREATE,
                                     doc_key,
                                     crud_result["value"],
                                     replicate_to=self.replicate_to,
                                     persist_to=self.persist_to,
                                     durability=self.durability_level,
                                     timeout=self.sdk_timeout)
            if result["status"] is False:
                self.log_failure("Retry of doc_key %s failed: %s"
                                 % (doc_key, result["error"]))
        # Close the SDK connection
        self.sdk_client_pool.release_client(sdk_client)
        self.validate_test_failure()

        self.bucket_util._wait_for_stats_all_buckets(self.cluster.buckets)
        # Update self.num_items and validate docs per collection
        if not self.N1qltxn and self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)

    def test_crash_process(self):
        """
        1. Starting loading docs into the default bucket
        2. Crash the requested process, which will not impact the
           memcached operations
        3. Wait for load bucket task to complete
        4. Validate the docs for durability
        """
        def_bucket = self.cluster.buckets[0]
        target_node = self.getTargetNode()
        remote = RemoteMachineShellConnection(target_node)
        target_vbuckets = range(0, self.cluster_util.vbuckets)
        retry_exceptions = list()
        self.transaction_load_task = None
        self.doc_loading_task = None
        self.N1ql_load_task = None

        # If Memcached is killed, we should not perform KV ops on
        # particular node. If not we can target all nodes for KV operation.
        if self.process_name == "memcached":
            target_vbuckets = CrashTest.getVbucketNumbers(
                remote, def_bucket.name, self.target_node)
            if self.target_node == "active":
                retry_exceptions = [SDKException.TimeoutException]
        if len(target_vbuckets) == 0:
            self.log.error("No target vbucket list generated to load data")
            remote.disconnect()
            return

        # Pick one random collection to load the new docs into
        bucket_dict = BucketUtils.get_random_collections(
            self.cluster.buckets,
            req_num=1,
            consider_scopes="all",
            consider_buckets="all")

        bucket = BucketUtils.get_bucket_obj(self.cluster.buckets,
                                            bucket_dict.keys()[0])
        scope_name = bucket_dict[bucket.name]["scopes"].keys()[0]
        collection_name = bucket_dict[bucket.name][
            "scopes"][scope_name]["collections"].keys()[0]
        scope = BucketUtils.get_scope_obj(
            bucket, scope_name)
        collection = BucketUtils.get_collection_obj(
            scope, collection_name)

        self.start_doc_loading_tasks(target_vbuckets, scope_name,
                                     collection)

        task_info = dict()
        task_info[self.doc_loading_task] = \
            self.bucket_util.get_doc_op_info_dict(
                def_bucket, DocLoading.Bucket.DocOps.CREATE, 0,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=self.durability_level,
                timeout=self.sdk_timeout, time_unit="seconds",
                retry_exceptions=retry_exceptions)

        self.sleep(10, "Wait for doc_ops to start")
        self.log.info("Killing {0}:{1} on node {2}"
                      .format(self.process_name, self.service_name,
                              target_node.ip))
        remote.kill_process(self.process_name, self.service_name,
                            signum=signum[self.sig_type])
        remote.disconnect()

        # Wait for tasks completion and validate failures
        if self.transaction_load_task:
            self.task.jython_task_manager.get_task_result(
                self.transaction_load_task)
        # The N1ql task exists only when start_doc_loading_tasks created
        # it (N1qltxn enabled and atomicity disabled)
        if self.N1ql_load_task is not None:
            self.task.jython_task_manager.get_task_result(
                self.N1ql_load_task)
        self.task_manager.get_task_result(self.doc_loading_task)
        self.bucket_util.verify_doc_op_task_exceptions(task_info,
                                                       self.cluster)
        self.bucket_util.log_doc_ops_task_failures(task_info)

        # Verification stats
        verification_dict = dict()
        verification_dict["ops_create"] = 2*self.num_items
        verification_dict["sync_write_aborted_count"] = 0
        verification_dict["rollback_item_count"] = 0
        verification_dict["pending_writes"] = 0
        if self.durability_level:
            verification_dict["sync_write_committed_count"] = \
                2*self.num_items

        if self.bucket_type == Bucket.Type.EPHEMERAL \
                and self.process_name == "memcached":
            result = self.task.rebalance(self.servers[:self.nodes_init],
                                         [], [])
            self.assertTrue(result, "Rebalance failed")

        # Validate doc count
        if self.process_name != "memcached":
            stats_failed = \
                self.durability_helper.verify_vbucket_details_stats(
                    def_bucket, self.cluster_util.get_kv_nodes(),
                    vbuckets=self.cluster_util.vbuckets,
                    expected_val=verification_dict)
            if stats_failed:
                self.fail("Cbstats verification failed")

        # Doc count validation per collection
        if not self.N1qltxn and self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)
class UpgradeTests(UpgradeBase):
    """
    Upgrade tests that move the cluster to 'self.upgrade_version' one node
    at a time (fetch_node_to_upgrade + upgrade_function[upgrade_type]) and
    validate doc loading, bucket durability, transactions, cbcollect and
    metrics collection while the upgrade is in progress and after it.
    """

    def setUp(self):
        """
        Run the base setup, then create the DurabilityHelper and the
        initial expected-stats dict used for vbucket-details validation.
        """
        super(UpgradeTests, self).setUp()
        self.durability_helper = DurabilityHelper(
            self.log, len(self.cluster.nodes_in_cluster))
        self.verification_dict = dict()
        self.verification_dict["ops_create"] = self.num_items
        self.verification_dict["ops_delete"] = 0

    def tearDown(self):
        """Delegate to the base class tearDown."""
        super(UpgradeTests, self).tearDown()

    def __trigger_cbcollect(self, log_path):
        """
        Trigger cb_collect_info on all nodes and copy the logs.

        :param log_path: Destination passed to copy_cb_collect_logs
        :return: Status of the last API called (trigger or copy).
                 A failure is also recorded through log_failure.
        """
        self.log.info("Triggering cb_collect_info")
        rest = RestConnection(self.cluster.master)
        nodes = rest.get_nodes()
        status = self.cluster_util.trigger_cb_collect_on_cluster(rest, nodes)

        if status is True:
            self.cluster_util.wait_for_cb_collect_to_complete(rest)
            status = self.cluster_util.copy_cb_collect_logs(
                rest, nodes, self.cluster, log_path)
            if status is False:
                self.log_failure("API copy_cb_collect_logs detected failure")
        else:
            self.log_failure("API perform_cb_collect returned False")
        return status

    def __play_with_collection(self):
        """
        Exercise scope/collection operations on the upgraded cluster.

        1. Create/drop a scope and collections through a single SDK client
        2. Create scopes/collections and load docs using the
           'initial_load' spec, then validate doc counts
        3. Drop scopes/collections using the same spec template and
           validate doc counts again

        Returns early (after log_failure) if any spec run fails.
        """
        # Client based scope/collection crud tests
        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        scope_name = self.bucket_util.get_random_name(
            max_length=CbServer.max_scope_name_len)
        collection_name = self.bucket_util.get_random_name(
            max_length=CbServer.max_collection_name_len)

        # Create scope using SDK client
        client.create_scope(scope_name)
        # Create collection under default scope and custom scope
        client.create_collection(collection_name, CbServer.default_scope)
        client.create_collection(collection_name, scope_name)
        # Drop created collections
        client.drop_collection(CbServer.default_scope, collection_name)
        client.drop_collection(scope_name, collection_name)
        # Drop created scope using SDK client
        client.drop_scope(scope_name)

        # MB-44092 - Collection load not working with pre-existing connections
        DocLoaderUtils.sdk_client_pool = SDKClientPool()
        self.log.info("Creating required SDK clients for client_pool")
        for bucket in self.bucket_util.buckets:
            DocLoaderUtils.sdk_client_pool.create_clients(
                bucket, [self.cluster.master], 1,
                compression_settings=self.sdk_compression)

        # Create scopes/collections phase
        collection_load_spec = \
            self.bucket_util.get_crud_template_from_package("initial_load")
        collection_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 0
        collection_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.NUM_ITEMS_FOR_NEW_COLLECTIONS] = 5000
        collection_load_spec[
            MetaCrudParams.SCOPES_TO_ADD_PER_BUCKET] = 5
        collection_load_spec[
            MetaCrudParams.COLLECTIONS_TO_ADD_FOR_NEW_SCOPES] = 10
        collection_load_spec[
            MetaCrudParams.COLLECTIONS_TO_ADD_PER_BUCKET] = 50
        collection_task = \
            self.bucket_util.run_scenario_from_spec(self.task,
                                                    self.cluster,
                                                    self.bucket_util.buckets,
                                                    collection_load_spec,
                                                    mutation_num=1,
                                                    batch_size=500)
        if collection_task.result is False:
            self.log_failure("Collection task failed")
            return
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util._wait_for_stats_all_buckets(cbstat_cmd="all",
                                                     stat_name="ep_queue_size",
                                                     timeout=60)
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Drop and recreate scope/collections
        collection_load_spec = \
            self.bucket_util.get_crud_template_from_package("initial_load")
        collection_load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 0
        collection_load_spec[MetaCrudParams.COLLECTIONS_TO_DROP] = 10
        collection_load_spec[MetaCrudParams.SCOPES_TO_DROP] = 2
        collection_task = \
            self.bucket_util.run_scenario_from_spec(self.task,
                                                    self.cluster,
                                                    self.bucket_util.buckets,
                                                    collection_load_spec,
                                                    mutation_num=1,
                                                    batch_size=500)
        if collection_task.result is False:
            self.log_failure("Drop scope/collection failed")
            return

        # MB-44092 - Close client_pool after collection ops
        DocLoaderUtils.sdk_client_pool.shutdown()

        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util._wait_for_stats_all_buckets(cbstat_cmd="all",
                                                     stat_name="ep_queue_size",
                                                     timeout=60)
        self.bucket_util.validate_docs_per_collections_all_buckets()

    def test_upgrade(self):
        """
        Upgrade the cluster node by node and load a batch of new docs
        after each node upgrade.

        - With atomicity, the bucket item count is compared against the
          expected count (mixed mode must not accept the creates).
        - Without atomicity, the failures of the create task are validated
          depending on whether the cluster is fully upgraded or whether it
          supports sync_writes.

        Collection operations are run at the end only if no failure was
        recorded. The optional continuous update task is always stopped.
        """
        create_batch_size = 10000
        update_task = None

        t_durability_level = ""
        if self.cluster_supports_sync_write:
            t_durability_level = Bucket.DurabilityLevel.MAJORITY

        if self.upgrade_with_data_load:
            self.log.info("Starting async doc updates")
            update_task = self.task.async_continuous_doc_ops(
                self.cluster, self.bucket, self.gen_load,
                op_type=DocLoading.Bucket.DocOps.UPDATE,
                process_concurrency=1,
                persist_to=1,
                replicate_to=1,
                durability=t_durability_level,
                timeout_secs=30)

        create_gen = doc_generator(self.key, self.num_items,
                                   self.num_items+create_batch_size)
        self.log.info("Upgrading cluster nodes to target version")
        node_to_upgrade = self.fetch_node_to_upgrade()
        while node_to_upgrade is not None:
            self.log.info("Selected node for upgrade: %s"
                          % node_to_upgrade.ip)
            self.upgrade_function[self.upgrade_type](node_to_upgrade,
                                                     self.upgrade_version)
            self.cluster_util.print_cluster_stats()

            # Validate sync_write results after upgrade
            if self.atomicity:
                create_batch_size = 10
                create_gen = doc_generator(
                    self.key,
                    self.num_items,
                    self.num_items+create_batch_size)
                sync_write_task = self.task.async_load_gen_docs_atomicity(
                    self.cluster, self.bucket_util.buckets,
                    create_gen, DocLoading.Bucket.DocOps.CREATE,
                    process_concurrency=1,
                    transaction_timeout=self.transaction_timeout,
                    record_fail=True)
            else:
                sync_write_task = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, create_gen,
                    DocLoading.Bucket.DocOps.CREATE,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    sdk_client_pool=self.sdk_client_pool,
                    process_concurrency=4,
                    skip_read_on_error=True,
                    suppress_error_table=True)
            self.task_manager.get_task_result(sync_write_task)

            # 'None' from here on means the cluster is fully upgraded
            node_to_upgrade = self.fetch_node_to_upgrade()
            if self.atomicity:
                self.sleep(10)
                current_items = \
                    self.bucket_util.get_bucket_current_item_count(
                        self.cluster, self.bucket)
                if node_to_upgrade is None:
                    if current_items < self.num_items+create_batch_size:
                        self.log_failure(
                            "Failures after cluster upgrade {} {}"
                            .format(current_items,
                                    self.num_items+create_batch_size))
                elif current_items > self.num_items:
                    self.log_failure(
                        "SyncWrite succeeded with mixed mode cluster")
            else:
                if node_to_upgrade is None:
                    if sync_write_task.fail.keys():
                        self.log_failure("Failures after cluster upgrade")
                    else:
                        self.num_items += create_batch_size
                        self.bucket.scopes[
                            CbServer.default_scope].collections[
                            CbServer.default_collection] \
                            .num_items += create_batch_size
                elif self.cluster_supports_sync_write:
                    if sync_write_task.fail:
                        self.log.error("SyncWrite failed: %s"
                                       % sync_write_task.fail)
                        self.log_failure("SyncWrite failed during upgrade")
                    else:
                        self.num_items += create_batch_size
                        self.bucket.scopes[
                            CbServer.default_scope].collections[
                            CbServer.default_collection] \
                            .num_items += create_batch_size
                        # Next batch starts after the docs just created
                        create_gen = doc_generator(
                            self.key,
                            self.num_items,
                            self.num_items + create_batch_size)
                elif len(sync_write_task.fail.keys()) != create_batch_size:
                    self.log_failure(
                        "SyncWrite succeeded with mixed mode cluster")
                else:
                    # Every create failed: each failure must carry the
                    # FeatureNotAvailable exception
                    for doc_id, doc_result in sync_write_task.fail.items():
                        if SDKException.FeatureNotAvailableException \
                                not in str(doc_result["error"]):
                            self.log_failure("Invalid exception for %s: %s"
                                             % (doc_id, doc_result))

            # Halt further upgrade if test has failed during current upgrade
            if self.test_failure is not None:
                break

        # Validate default collection stats before collection ops
        self.bucket_util._wait_for_stats_all_buckets(cbstat_cmd="all",
                                                     stat_name="ep_queue_size",
                                                     timeout=60)
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Play with collection if upgrade was successful
        if not self.test_failure:
            self.__play_with_collection()

        if self.upgrade_with_data_load:
            # Wait for update_task to complete
            update_task.end_task()
            self.task_manager.get_task_result(update_task)

        self.validate_test_failure()

    def test_bucket_durability_upgrade(self):
        """
        Upgrade the cluster node by node, creating a batch of docs after
        each node upgrade, then cycle the bucket through every supported
        bucket_durability level performing one doc op per level.

        vbucket-details cbstats are validated against
        self.verification_dict after the upgrade and again after the
        bucket_durability updates.
        """
        update_task = None
        self.sdk_timeout = 60
        create_batch_size = 10000
        if self.atomicity:
            create_batch_size = 10

        # To make sure sync_write can we supported by initial cluster version
        sync_write_support = True
        if float(self.initial_version[0:3]) < 6.5:
            sync_write_support = False

        if sync_write_support:
            self.verification_dict["rollback_item_count"] = 0
            self.verification_dict["sync_write_aborted_count"] = 0

        if self.upgrade_with_data_load:
            self.log.info("Starting async doc updates")
            update_task = self.task.async_continuous_doc_ops(
                self.cluster, self.bucket, self.gen_load,
                op_type=DocLoading.Bucket.DocOps.UPDATE,
                process_concurrency=1,
                persist_to=1,
                replicate_to=1,
                timeout_secs=30)

        self.log.info("Upgrading cluster nodes to target version")
        node_to_upgrade = self.fetch_node_to_upgrade()
        while node_to_upgrade is not None:
            self.log.info("Selected node for upgrade: %s"
                          % node_to_upgrade.ip)
            self.upgrade_function[self.upgrade_type](node_to_upgrade,
                                                     self.upgrade_version)
            # Fall back to the last initial node if the first server
            # cannot be used to update the master
            try:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[0])
            except Exception:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[self.nodes_init-1])

            create_gen = doc_generator(self.key, self.num_items,
                                       self.num_items+create_batch_size)
            # Validate sync_write results after upgrade
            if self.atomicity:
                sync_write_task = self.task.async_load_gen_docs_atomicity(
                    self.cluster, self.bucket_util.buckets,
                    create_gen, DocLoading.Bucket.DocOps.CREATE,
                    process_concurrency=1,
                    transaction_timeout=self.transaction_timeout,
                    record_fail=True)
            else:
                sync_write_task = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, create_gen,
                    DocLoading.Bucket.DocOps.CREATE,
                    timeout_secs=self.sdk_timeout,
                    process_concurrency=4,
                    sdk_client_pool=self.sdk_client_pool,
                    skip_read_on_error=True,
                    suppress_error_table=True)
            self.task_manager.get_task_result(sync_write_task)
            self.num_items += create_batch_size

            # Poll the item count at most 5 times. The counter must be
            # advanced on every mismatch, otherwise a permanent mismatch
            # would keep this loop running forever.
            retry_index = 0
            while retry_index < 5:
                self.sleep(3, "Wait for num_items to match")
                current_items = \
                    self.bucket_util.get_bucket_current_item_count(
                        self.cluster, self.bucket)
                if current_items == self.num_items:
                    break
                self.log.debug("Num_items mismatch. Expected: %s, Actual: %s"
                               % (self.num_items, current_items))
                retry_index += 1

            # Doc count validation
            self.cluster_util.print_cluster_stats()

            self.verification_dict["ops_create"] += create_batch_size
            self.summary.add_step("Upgrade %s" % node_to_upgrade.ip)

            # Halt further upgrade if test has failed during current upgrade
            if self.test_failure:
                break

            node_to_upgrade = self.fetch_node_to_upgrade()

        if self.upgrade_with_data_load:
            # Wait for update_task to complete
            update_task.end_task()
            self.task_manager.get_task_result(update_task)
        else:
            self.verification_dict["ops_update"] = 0

        # Cb_stats vb-details validation
        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket_util.buckets[0],
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details validation failed")
        self.summary.add_step("Cbstats vb-details verification")

        self.validate_test_failure()

        possible_d_levels = dict()
        possible_d_levels[Bucket.Type.MEMBASE] = \
            self.bucket_util.get_supported_durability_levels()
        possible_d_levels[Bucket.Type.EPHEMERAL] = [
            Bucket.DurabilityLevel.NONE,
            Bucket.DurabilityLevel.MAJORITY]
        # Index of the last durability level for the bucket type
        len_possible_d_levels = len(possible_d_levels[self.bucket_type]) - 1

        if not sync_write_support:
            self.verification_dict["rollback_item_count"] = 0
            self.verification_dict["sync_write_aborted_count"] = 0

        # Perform bucket_durability update
        key, value = doc_generator("b_durability_doc", 0, 1).next()
        client = SDKClient([self.cluster.master],
                           self.bucket_util.buckets[0])
        # try/finally so the SDK client is closed even if an update or
        # doc op raises
        try:
            for index, d_level in enumerate(
                    possible_d_levels[self.bucket_type]):
                self.log.info("Updating bucket_durability=%s" % d_level)
                self.bucket_util.update_bucket_property(
                    self.bucket_util.buckets[0],
                    bucket_durability=BucketDurability[d_level])
                self.bucket_util.print_bucket_stats()

                buckets = self.bucket_util.get_all_buckets()
                if buckets[0].durability_level != BucketDurability[d_level]:
                    self.log_failure("New bucket_durability not taken")

                self.summary.add_step("Update bucket_durability=%s"
                                      % d_level)

                self.sleep(10,
                           "MB-39678: Bucket_d_level change to take effect")

                # First level creates the doc, last level deletes it,
                # every level in between updates it
                if index == 0:
                    op_type = DocLoading.Bucket.DocOps.CREATE
                    self.verification_dict["ops_create"] += 1
                elif index == len_possible_d_levels:
                    op_type = DocLoading.Bucket.DocOps.DELETE
                    self.verification_dict["ops_delete"] += 1
                else:
                    op_type = DocLoading.Bucket.DocOps.UPDATE
                    if "ops_update" in self.verification_dict:
                        self.verification_dict["ops_update"] += 1

                result = client.crud(op_type, key, value,
                                     timeout=self.sdk_timeout)
                if result["status"] is False:
                    self.log_failure("Doc_op %s failed on key %s: %s"
                                     % (op_type, key, result["error"]))
                self.summary.add_step("Doc_op %s" % op_type)
        finally:
            client.close()

        # Cb_stats vb-details validation
        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket_util.buckets[0],
            self.cluster_util.get_kv_nodes(),
            vbuckets=self.cluster_util.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details validation failed")
        self.summary.add_step("Cbstats vb-details verification")
        self.validate_test_failure()

    def test_transaction_doc_isolation(self):
        """
        Upgrade the cluster node by node, optionally running transaction
        updates in a background thread during each node upgrade, and
        validate the 'tombstonedUserXAttrs' bucket capability: it must be
        present only once no node is left to upgrade.

        A transaction create load is run after the upgrade loop.
        """
        def run_transaction_updates():
            # Runs until the enclosing test sets 'stop_thread'; the flag
            # is re-read from the enclosing scope on every iteration
            self.log.info("Starting transaction updates in parallel")
            while not stop_thread:
                commit_trans = choice([True, False])
                trans_update_task = self.task.async_load_gen_docs_atomicity(
                    self.cluster, self.bucket_util.buckets, self.gen_load,
                    DocLoading.Bucket.DocOps.UPDATE,
                    exp=self.maxttl,
                    batch_size=50,
                    process_concurrency=3,
                    timeout_secs=self.sdk_timeout,
                    update_count=self.update_count,
                    transaction_timeout=self.transaction_timeout,
                    commit=commit_trans,
                    durability=self.durability_level,
                    sync=self.sync, defer=self.defer,
                    retries=0)
                self.task_manager.get_task_result(trans_update_task)

        stop_thread = False
        update_task = None
        self.sdk_timeout = 60

        self.log.info("Upgrading cluster nodes to target version")
        node_to_upgrade = self.fetch_node_to_upgrade()
        while node_to_upgrade is not None:
            self.log.info("Selected node for upgrade: %s"
                          % node_to_upgrade.ip)
            if self.upgrade_with_data_load:
                # Re-arm the flag: it is left as True by the previous
                # iteration, which would make the new thread exit
                # without running any transaction
                stop_thread = False
                update_task = Thread(target=run_transaction_updates)
                update_task.start()

            self.upgrade_function[self.upgrade_type](node_to_upgrade,
                                                     self.upgrade_version)
            try:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[0])
            except Exception:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[self.nodes_init-1])

            if self.upgrade_with_data_load:
                stop_thread = True
                update_task.join()

            self.cluster_util.print_cluster_stats()
            self.bucket_util.print_bucket_stats()

            self.summary.add_step("Upgrade %s" % node_to_upgrade.ip)

            # Halt further upgrade if test has failed during current upgrade
            if self.test_failure:
                break

            node_to_upgrade = self.fetch_node_to_upgrade()
            for bucket in self.bucket_util.get_all_buckets():
                tombstone_doc_supported = \
                    "tombstonedUserXAttrs" in bucket.bucketCapabilities
                if node_to_upgrade is None and not tombstone_doc_supported:
                    self.log_failure("Tombstone docs not added to %s "
                                     "capabilities" % bucket.name)
                elif node_to_upgrade is not None and tombstone_doc_supported:
                    self.log_failure("Tombstone docs supported for %s before "
                                     "cluster upgrade" % bucket.name)

        self.validate_test_failure()

        create_gen = doc_generator(self.key, self.num_items,
                                   self.num_items*2)
        # Start transaction load after node upgrade
        trans_task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets,
            create_gen, DocLoading.Bucket.DocOps.CREATE, exp=self.maxttl,
            batch_size=50,
            process_concurrency=8,
            timeout_secs=self.sdk_timeout,
            update_count=self.update_count,
            transaction_timeout=self.transaction_timeout,
            commit=True,
            durability=self.durability_level,
            sync=self.sync, defer=self.defer,
            retries=0)
        self.task_manager.get_task_result(trans_task)

    def test_cbcollect_info(self):
        """
        Run cb_collect_info and metrics collection before each node
        upgrade (mixed mode) and once more after the upgrade loop, with
        three continuous update tasks running in the background.

        Test params read: 'parse', 'metric_name', 'logs_folder'.
        """
        self.parse = self.input.param("parse", False)
        self.metric_name = self.input.param("metric_name", "kv_curr_items")
        log_path = self.input.param("logs_folder")
        self.log.info("Starting update tasks")
        update_tasks = list()
        update_tasks.append(self.task.async_continuous_doc_ops(
            self.cluster, self.bucket, self.gen_load,
            op_type=DocLoading.Bucket.DocOps.UPDATE,
            persist_to=1,
            replicate_to=1,
            process_concurrency=1,
            batch_size=10,
            timeout_secs=30))
        update_tasks.append(self.task.async_continuous_doc_ops(
            self.cluster, self.bucket, self.gen_load,
            op_type=DocLoading.Bucket.DocOps.UPDATE,
            replicate_to=1,
            process_concurrency=1,
            batch_size=10,
            timeout_secs=30))
        update_tasks.append(self.task.async_continuous_doc_ops(
            self.cluster, self.bucket, self.gen_load,
            op_type=DocLoading.Bucket.DocOps.UPDATE,
            persist_to=1,
            process_concurrency=1,
            batch_size=10,
            timeout_secs=30))

        node_to_upgrade = self.fetch_node_to_upgrade()
        while node_to_upgrade is not None:
            # Cbcollect with mixed mode cluster
            status = self.__trigger_cbcollect(log_path)
            if status is False:
                break

            self.log.info("Selected node for upgrade: %s"
                          % node_to_upgrade.ip)
            self.upgrade_function[self.upgrade_type](node_to_upgrade,
                                                     self.upgrade_version)
            self.cluster_util.print_cluster_stats()

            try:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[0])
            except Exception:
                self.cluster.update_master_using_diag_eval(
                    self.cluster.servers[self.nodes_init-1])

            # TODO: Do some validations here
            # Metrics failures are tolerated while the cluster is in
            # mixed mode, but they are logged instead of being dropped
            try:
                self.get_all_metrics(self.parse, self.metric_name)
            except Exception as error:
                self.log.debug("Metrics collection failed during upgrade: %s"
                               % error)

            node_to_upgrade = self.fetch_node_to_upgrade()

            # Halt further upgrade if test has failed during current upgrade
            # (truthiness check, consistent with the other upgrade tests)
            if self.test_failure:
                break

        # Metrics should work in fully upgraded cluster
        self.get_all_metrics(self.parse, self.metric_name)
        # Cbcollect with fully upgraded cluster
        self.__trigger_cbcollect(log_path)

        for update_task in update_tasks:
            # Wait for update_task to complete
            update_task.end_task()
            self.task_manager.get_task_result(update_task)

        self.validate_test_failure()

    def get_low_cardinality_metrics(self, parse):
        """
        Fetch prometheus metrics from every KV node, validating them when
        'parse' is falsy, and log the content fetched from the last node.

        :param parse: Forwarded to StatsHelper.get_prometheus_metrics
        """
        content = None
        for server in self.cluster_util.get_kv_nodes():
            content = StatsHelper(server).get_prometheus_metrics(parse=parse)
            if not parse:
                StatsHelper(server)._validate_metrics(content)
        for line in content:
            self.log.info(line.strip("\n"))

    def get_high_cardinality_metrics(self, parse):
        """
        Best-effort variant of get_low_cardinality_metrics for the
        high-cardinality endpoint: any failure is logged and suppressed.

        :param parse: Forwarded to StatsHelper.get_prometheus_metrics_high
        """
        content = None
        try:
            for server in self.cluster_util.get_kv_nodes():
                content = StatsHelper(server).get_prometheus_metrics_high(
                    parse=parse)
                if not parse:
                    StatsHelper(server)._validate_metrics(content)
            for line in content:
                self.log.info(line.strip("\n"))
        except Exception as error:
            # Deliberately non-fatal, but leave a trace of what failed
            self.log.error("High cardinality metrics collection failed: %s"
                           % error)

    def get_range_api_metrics(self, metric_name):
        """
        Fetch 'metric_name' through the range API of the master node,
        filtered by the first bucket's name and the master's ip, and log
        the returned content.
        """
        label_values = {"bucket": self.bucket_util.buckets[0].name,
                        "nodes": self.cluster.master.ip}
        content = StatsHelper(self.cluster.master).get_range_api_metrics(
            metric_name, label_values=label_values)
        self.log.info(content)

    def get_instant_api(self, metric_name):
        """Placeholder: instant API metrics are not collected yet."""
        pass

    def get_all_metrics(self, parse, metrics):
        """
        Run all the metrics collectors in sequence.

        :param parse: Passed to the low/high cardinality collectors
        :param metrics: Metric name passed to the range/instant collectors
        """
        self.get_low_cardinality_metrics(parse)
        self.get_high_cardinality_metrics(parse)
        self.get_range_api_metrics(metrics)
        self.get_instant_api(metrics)
def test_with_process_crash(self):
    """
    Test to make sure durability will succeed even if a node goes down
    due to crash and has enough nodes to satisfy the durability

    1. Select a node from the cluster to simulate the specified error
    2. Perform CRUD on the target bucket with given timeout
    3. Using cbstats to verify the operation succeeds
    4. Validate all mutations are succeeded

    The error to simulate is read from self.simulate_error. Failover and
    seqno cbstats are captured per target node before the load ("init")
    and after it ("afterCrud") and compared at the end.

    Note: self.sdk_timeout values is considered as 'seconds'
    """
    # Fails the test right away unless at least 2 replicas are configured
    if self.num_replicas < 2:
        self.assertTrue(False, msg="Required: num_replicas > 1")

    # Override num_of_nodes affected to 1 (Positive case)
    self.num_nodes_affected = 1

    # All dicts below are keyed by node.ip
    error_sim = dict()
    shell_conn = dict()
    cbstat_obj = dict()
    failover_info = dict()
    vb_info_info = dict()
    active_vbs_in_target_nodes = list()
    failover_info["init"] = dict()
    failover_info["afterCrud"] = dict()
    vb_info_info["init"] = dict()
    vb_info_info["afterCrud"] = dict()

    self.log.info("Selecting nodes to simulate error condition")
    target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                   self.nodes_init,
                                                   self.num_nodes_affected)

    self.log.info("Will simulate error condition on %s" % target_nodes)
    # Capture the active vbuckets and the initial seqno/failover stats
    for node in target_nodes:
        shell_conn[node.ip] = RemoteMachineShellConnection(node)
        cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
        active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
            self.bucket.name,
            "active")
        vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
            self.bucket.name)
        failover_info["init"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

    # Remove active vbuckets from doc_loading to avoid errors
    load_spec = dict()
    load_spec["doc_crud"] = dict()
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
    load_spec["doc_crud"][
        MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"
    # NOTE(review): '^' is a symmetric difference; it equals a plain
    # difference only while every active vb lies in range(0, 1024).
    # The vbucket count is hard-coded here - confirm it matches the cluster
    load_spec["target_vbuckets"] = list(
        set(range(0, 1024)) ^ set(active_vbs_in_target_nodes))

    self.log.info("Perform 'create', 'update', 'delete' mutations")
    # async_load=True: the error is injected while this load is running
    doc_loading_task = \
        self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            load_spec,
            mutation_num=1,
            async_load=True)

    self.sleep(5, "Wait for doc loaders to start loading data")

    for node in target_nodes:
        # Create shell_connections
        # NOTE(review): this replaces the connection opened in the first
        # loop; the earlier one stays referenced only by cbstat_obj and
        # is never disconnected in this method - confirm whether intended
        shell_conn[node.ip] = RemoteMachineShellConnection(node)

        # Perform specified action
        error_sim[node.ip] = CouchbaseError(self.log,
                                            shell_conn[node.ip])
        error_sim[node.ip].create(self.simulate_error,
                                  bucket_name=self.bucket.name)

    # Perform new scope/collection creation during doc ops in parallel
    self.__perform_collection_crud()

    # Wait for document_loader tasks to complete
    self.task_manager.get_task_result(doc_loading_task)
    self.bucket_util.validate_doc_loading_results(doc_loading_task)
    if doc_loading_task.result is False:
        self.log_failure("Doc CRUDs failed with process crash")

    # Disk errors are not reverted here
    # NOTE(review): nesting reconstructed from a collapsed source layout;
    # confirm the sleep and the Ephemeral rebalance belong to this branch
    if self.simulate_error \
            not in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
        # Revert the induced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

            # Disconnect the shell connection
            shell_conn[node.ip].disconnect()
        self.sleep(10, "Wait for node recovery to complete")

        # In case of error with Ephemeral bucket, need to rebalance
        # to make sure data is redistributed properly
        if self.bucket_type == Bucket.Type.EPHEMERAL:
            retry_num = 0
            result = None
            # At most two rebalance attempts
            while retry_num != 2:
                result = self.task.rebalance(
                    self.servers[0:self.nodes_init],
                    [], [])
                if result:
                    break
                retry_num += 1
                self.sleep(10, "Wait before retrying rebalance")

            self.assertTrue(result, "Rebalance failed")

    # Fetch latest failover stats and validate the values are updated
    self.log.info("Validating failover and seqno cbstats")
    for node in target_nodes:
        vb_info_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
        failover_info["afterCrud"][node.ip] = \
            cbstat_obj[node.ip].failover_stats(self.bucket.name)

        # Failover stat validation
        # Stats must differ after KILL_MEMCACHED, and for Ephemeral
        # buckets with any error other than STOP_MEMCACHED; in every
        # other case they must be unchanged
        if self.simulate_error == CouchbaseError.KILL_MEMCACHED:
            val = failover_info["init"][node.ip] \
                  != failover_info["afterCrud"][node.ip]
        else:
            if self.simulate_error != CouchbaseError.STOP_MEMCACHED \
                    and self.bucket_type == Bucket.Type.EPHEMERAL:
                val = failover_info["init"][node.ip] \
                      != failover_info["afterCrud"][node.ip]
            else:
                val = failover_info["init"][node.ip] \
                      == failover_info["afterCrud"][node.ip]
        error_msg = "Failover stats mismatch after error condition:" \
                    " %s != %s" \
                    % (failover_info["init"][node.ip],
                       failover_info["afterCrud"][node.ip])
        self.assertTrue(val, msg=error_msg)

        # Seq_no validation (High level)
        val = \
            vb_info_info["init"][node.ip] \
            != vb_info_info["afterCrud"][node.ip]
        self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

    # Doc count validation
    self.validate_test_failure()
    self.bucket_util.validate_docs_per_collections_all_buckets()
def setUp(self):
    """
    Run the parent setUp, then build the DurabilityHelper from the
    current cluster size and the configured durability level.
    """
    super(DurabilitySuccessTests, self).setUp()

    # Helper is sized by the number of nodes currently in the cluster
    num_cluster_nodes = len(self.cluster.nodes_in_cluster)
    helper = DurabilityHelper(self.log,
                              num_cluster_nodes,
                              self.durability_level)
    self.durability_helper = helper

    self.log.info("=== DurabilitySuccessTests setup complete ===")