class StorageBase(BaseTestCase):
    """Common base for storage (magma/couchstore) tests.

    Builds the cluster, creates buckets/scopes/collections and provides
    doc-generation, doc-loading and chaos (crash/chmod/swap) helpers used
    by the concrete storage test classes.
    """

    def setUp(self):
        super(StorageBase, self).setUp()
        self.rest = RestConnection(self.cluster.master)
        self.data_path = self.fetch_data_path()

        # Bucket Params
        self.vbuckets = self.input.param("vbuckets",
                                         self.cluster.vbuckets)
        self.bucket_ram_quota = self.input.param("bucket_ram_quota", None)
        self.fragmentation = int(self.input.param("fragmentation", 50))
        self.bucket_storage = self.input.param("bucket_storage",
                                               Bucket.StorageBackend.magma)
        self.bucket_eviction_policy = self.input.param(
            "bucket_eviction_policy",
            Bucket.EvictionPolicy.FULL_EVICTION)
        self.bucket_util.add_rbac_user(self.cluster.master)
        self.bucket_name = self.input.param("bucket_name", None)
        self.magma_buckets = self.input.param("magma_buckets", 0)

        # SDK Exceptions that doc-load tasks should retry on
        self.check_temporary_failure_exception = False
        self.retry_exceptions = [
            SDKException.TimeoutException,
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException,
            SDKException.UnambiguousTimeoutException,
            SDKException.ServerOutOfMemoryException,
            SDKException.DurabilityAmbiguousException]
        self.ignore_exceptions = []

        # Sets autocompaction at bucket level
        self.autoCompactionDefined = str(
            self.input.param("autoCompactionDefined", "false")).lower()

        # Create Cluster
        self.rest.init_cluster(username=self.cluster.master.rest_username,
                               password=self.cluster.master.rest_password)
        nodes_init = self.cluster.servers[1:self.nodes_init]
        self.services = ["kv"] * self.nodes_init
        self.dcp_services = self.input.param("dcp_services", None)
        self.dcp_servers = []
        if self.dcp_services:
            server = self.rest.get_nodes_self()
            self.rest.set_service_mem_quota(
                {CbServer.Settings.INDEX_MEM_QUOTA:
                 int(server.mcdMemoryReserved - 100)})
            # "-" separates nodes, ":" separates services on one node
            self.dcp_services = [service.replace(":", ",")
                                 for service in self.dcp_services.split("-")]
            self.services.extend(self.dcp_services)
            self.dcp_servers = self.cluster.servers[
                self.nodes_init:self.nodes_init + len(self.dcp_services)]
        nodes_in = nodes_init + self.dcp_servers
        result = self.task.rebalance([self.cluster.master], nodes_in, [],
                                     services=self.services[1:])
        self.assertTrue(result, "Initial rebalance failed")
        self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_in)
        for idx, node in enumerate(self.cluster.nodes_in_cluster):
            node.services = self.services[idx]

        # Create Buckets
        if self.standard_buckets == 1:
            self.bucket_util.create_default_bucket(
                self.cluster,
                bucket_type=self.bucket_type,
                ram_quota=self.bucket_ram_quota,
                replica=self.num_replicas,
                storage=self.bucket_storage,
                eviction_policy=self.bucket_eviction_policy,
                autoCompactionDefined=self.autoCompactionDefined,
                fragmentation_percentage=self.fragmentation,
                flush_enabled=self.flush_enabled)
        else:
            buckets_created = self.bucket_util.create_multiple_buckets(
                self.cluster, self.num_replicas,
                bucket_count=self.standard_buckets,
                bucket_type=self.bucket_type,
                storage={"couchstore": self.standard_buckets
                         - self.magma_buckets,
                         "magma": self.magma_buckets},
                eviction_policy=self.bucket_eviction_policy,
                bucket_name=self.bucket_name,
                fragmentation_percentage=self.fragmentation,
                flush_enabled=self.flush_enabled)
            self.assertTrue(buckets_created,
                            "Unable to create multiple buckets")

        self.buckets = self.cluster.buckets
        # self.num_collections=1 signifies only default collection
        self.num_collections = self.input.param("num_collections", 1)
        self.num_scopes = self.input.param("num_scopes", 1)

        # Creation of scopes if num_scopes is > 1
        scope_prefix = "Scope"
        for bucket in self.cluster.buckets:
            for i in range(1, self.num_scopes):
                scope_name = scope_prefix + str(i)
                self.log.info("Creating bucket::scope {} {}".format(
                    bucket.name, scope_name))
                self.bucket_util.create_scope(self.cluster.master,
                                              bucket,
                                              {"name": scope_name})
                self.sleep(2)
        self.scopes = self.buckets[0].scopes.keys()
        self.log.info("Scopes list is {}".format(self.scopes))

        # Creation of collections if num_collections is > 1
        collection_prefix = "FunctionCollection"
        for bucket in self.cluster.buckets:
            for scope_name in self.scopes:
                for i in range(1, self.num_collections):
                    collection_name = collection_prefix + str(i)
                    self.log.info("Creating scope::collection {} {}".format(
                        scope_name, collection_name))
                    self.bucket_util.create_collection(
                        self.cluster.master, bucket,
                        scope_name, {"name": collection_name})
                    self.sleep(2)
        self.collections = self.buckets[0].scopes[
            CbServer.default_scope].collections.keys()
        self.log.debug("Collections list == {}".format(self.collections))

        # With a dedicated index/n1ql node, build an initial index so DCP
        # streams are exercised during the test
        if self.dcp_services and self.num_collections == 1:
            self.initial_idx = "initial_idx"
            self.initial_idx_q = \
                "CREATE INDEX %s on default:`%s`.`%s`.`%s`(meta().id) " \
                "with {\"defer_build\": false};" % (
                    self.initial_idx, self.buckets[0].name,
                    CbServer.default_scope, self.collections[0])
            self.query_client = RestConnection(self.dcp_servers[0])
            result = self.query_client.query_tool(self.initial_idx_q)
            self.assertTrue(result["status"] == "success",
                            "Index query failed!")

        # Doc controlling params
        self.key = 'test_docs'
        self.key_size = self.input.param("key_size", 8)
        if self.random_key:
            self.key = "random_keys"
            # With a small key_size, random.random() returning 0.0 can push
            # the generated key past the 250-byte limit
            # (see documentgenerator.py), so use a larger default
            self.key_size = self.input.param("key_size", 20)
        self.doc_ops = self.input.param("doc_ops", "create")
        self.doc_size = self.input.param("doc_size", 2048)
        self.gen_create = None
        self.gen_delete = None
        self.gen_read = None
        self.gen_update = None
        self.gen_expiry = None
        # BUGFIX: previously read the "update_perc" key by mistake, so
        # setting update_perc silently changed create_perc as well
        self.create_perc = self.input.param("create_perc", 100)
        self.update_perc = self.input.param("update_perc", 0)
        self.delete_perc = self.input.param("delete_perc", 0)
        self.expiry_perc = self.input.param("expiry_perc", 0)
        self.start = 0
        self.end = 0
        self.create_start = None
        self.create_end = None
        self.update_start = None
        self.update_end = None
        self.delete_start = None
        self.delete_end = None
        self.read_start = None
        self.read_end = None
        self.expiry_start = None
        self.expiry_end = None
        self.mutate = 0
        self.init_items_per_collection = self.num_items
        '''
           --For DGM test
               -self.init_items_per_collection will be overwritten
                in load_buckets_in_dgm method
           --For Non-DGM tests in MultiCollection environment,
               -self.num_items will be updated after doc loading
           -- self.init_num_items is needed to preserve initial
              doc count given in test
        '''
        self.init_num_items = self.num_items
        self.maxttl = self.input.param("maxttl", 10)

        # Common test params
        self.test_itr = self.input.param("test_itr", 4)
        self.update_itr = self.input.param("update_itr", 2)
        self.next_half = self.input.param("next_half", False)
        self.deep_copy = self.input.param("deep_copy", False)
        self.suppress_error_table = True
        self.skip_read_on_error = False
        self.track_failures = True

    def _loader_dict(self):
        """Build self.loader_dict: per bucket/scope/collection op params
        consumed by async_load_gen_docs_from_spec."""
        loader_dict = dict()
        common_params = {"retry_exceptions": self.retry_exceptions,
                         "suppress_error_table": self.suppress_error_table,
                         "durability_level": self.durability_level,
                         "skip_read_success_results": False,
                         "target_items": 5000,
                         "skip_read_on_error": self.skip_read_on_error,
                         "track_failures": self.track_failures,
                         "ignore_exceptions": self.ignore_exceptions,
                         "sdk_timeout_unit": self.time_unit,
                         "sdk_timeout": self.sdk_timeout,
                         "doc_ttl": 0,
                         "doc_gen_type": "default"}
        for bucket in self.cluster.buckets:
            loader_dict.update({bucket: dict()})
            loader_dict[bucket].update({"scopes": dict()})
            for scope in bucket.scopes.keys():
                loader_dict[bucket]["scopes"].update({scope: dict()})
                loader_dict[bucket]["scopes"][scope].update(
                    {"collections": dict()})
                for collection in bucket.scopes[scope].collections.keys():
                    loader_dict[bucket]["scopes"][scope][
                        "collections"].update({collection: dict()})
                    if self.gen_update is not None:
                        op_type = "update"
                        common_params.update({"doc_gen": self.gen_update})
                        loader_dict[bucket]["scopes"][scope]["collections"][
                            collection][op_type] = \
                            copy.deepcopy(common_params)
                    if self.gen_create is not None:
                        op_type = "create"
                        common_params.update({"doc_gen": self.gen_create})
                        loader_dict[bucket]["scopes"][scope]["collections"][
                            collection][op_type] = \
                            copy.deepcopy(common_params)
                    if self.gen_delete is not None:
                        op_type = "delete"
                        common_params.update({"doc_gen": self.gen_delete})
                        loader_dict[bucket]["scopes"][scope]["collections"][
                            collection][op_type] = \
                            copy.deepcopy(common_params)
                    if self.gen_expiry is not None and self.maxttl:
                        # NOTE: expiry is modelled as an "update" with a TTL,
                        # so it overwrites any plain update entry above
                        op_type = "update"
                        common_params.update({"doc_gen": self.gen_expiry,
                                              "doc_ttl": self.maxttl})
                        loader_dict[bucket]["scopes"][scope]["collections"][
                            collection][op_type] = \
                            copy.deepcopy(common_params)
                        common_params.update({"doc_ttl": 0})
                    if self.gen_read is not None:
                        op_type = "read"
                        common_params.update({
                            "doc_gen": self.gen_read,
                            "skip_read_success_results": True,
                            "track_failures": False,
                            "suppress_error_table": True})
                        # BUGFIX: deepcopy here too - the original stored the
                        # shared common_params dict, which later iterations
                        # kept mutating
                        loader_dict[bucket]["scopes"][scope]["collections"][
                            collection][op_type] = \
                            copy.deepcopy(common_params)
                        # BUGFIX: restore the read-specific overrides so they
                        # do not leak into the next collection's op params
                        common_params.update({
                            "skip_read_success_results": False,
                            "track_failures": self.track_failures,
                            "suppress_error_table":
                                self.suppress_error_table})
        self.loader_dict = loader_dict

    def doc_loader(self, loader_spec):
        """Start (and return) the async doc-load task for loader_spec."""
        task = self.task.async_load_gen_docs_from_spec(
            self.cluster, self.task_manager, loader_spec,
            self.sdk_client_pool,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            print_ops_rate=True,
            start_task=True,
            track_failures=self.track_failures)
        return task

    def data_load(self):
        """Build the loader dict from current doc gens and start loading."""
        self._loader_dict()
        return self.doc_loader(self.loader_dict)

    def wait_for_doc_load_completion(self, task, wait_for_stats=True):
        """Block until the doc-load task finishes; optionally wait for
        bucket stats (item counts) to settle."""
        self.task_manager.get_task_result(task)
        self.bucket_util.validate_doc_loading_results(task)
        self.assertTrue(task.result,
                        "Doc ops failed for task: {}".format(
                            task.thread_name))
        if wait_for_stats:
            self.bucket_util._wait_for_stats_all_buckets(
                self.cluster, self.cluster.buckets, timeout=1800)

    def initial_load(self):
        """Perform the initial 'create' load and update doc-count state."""
        self.create_start = 0
        self.create_end = self.init_items_per_collection
        if self.rev_write:
            # Reverse-write: keys descend through negative offsets
            self.create_start = -int(self.init_items_per_collection - 1)
            self.create_end = 1
        self.generate_docs(doc_ops="create")
        self.log.debug("initial_items_in_each_collection {}".format(
            self.init_items_per_collection))
        task = self.data_load()
        self.wait_for_doc_load_completion(task)
        self.num_items = self.init_items_per_collection \
            * self.num_collections
        self.read_start = 0
        self.read_end = self.init_items_per_collection

    def load_buckets_in_dgm(self, kv_gen, op_type, exp, flag=0,
                            batch_size=1000,
                            timeout_secs=30, compression=True,
                            skip_read_on_error=False,
                            suppress_error_table=False,
                            track_failures=False):
        """Load every scope/collection until active_resident_threshold is
        reached, then record per-collection doc counts.

        Side effect: CbServer.default_collection is removed from
        self.collections (original behavior, preserved).
        """
        tasks_info = dict()
        self.collections.remove(CbServer.default_collection)
        docs_per_task = dict()
        # BUGFIX: dict.fromkeys(self.scopes, dict()) shared ONE dict across
        # all keys; give every scope its own dict
        docs_per_scope = dict([(scope, dict()) for scope in self.scopes])
        for scope in self.scopes:
            task_per_collection = dict()
            if scope == CbServer.default_scope:
                self.collections.append(CbServer.default_collection)
            for collection in self.collections:
                task_info = self.bucket_util._async_load_all_buckets(
                    self.cluster, kv_gen, op_type, exp, flag,
                    persist_to=self.persist_to,
                    replicate_to=self.replicate_to,
                    durability=self.durability_level,
                    timeout_secs=timeout_secs,
                    time_unit=self.time_unit,
                    batch_size=batch_size,
                    sdk_compression=compression,
                    process_concurrency=self.process_concurrency,
                    retry_exceptions=self.retry_exceptions,
                    active_resident_threshold=self.active_resident_threshold,
                    skip_read_on_error=skip_read_on_error,
                    suppress_error_table=suppress_error_table,
                    dgm_batch=self.dgm_batch,
                    scope=scope, collection=collection,
                    monitor_stats=self.monitor_stats,
                    track_failures=track_failures,
                    sdk_client_pool=self.sdk_client_pool)
                tasks_info.update(task_info.items())
                task_per_collection[collection] = list(task_info.keys())[0]
            if scope == CbServer.default_scope:
                self.collections.remove(CbServer.default_collection)
            docs_per_scope[scope] = task_per_collection
        for task in tasks_info.keys():
            self.task_manager.get_task_result(task)
        if self.active_resident_threshold < 100:
            for task, _ in tasks_info.items():
                docs_per_task[task] = task.doc_index
            self.log.info("docs_per_task : {}".format(docs_per_task))
            # Replace task handles with the number of docs each task loaded
            for scope in self.scopes:
                for collection in self.collections:
                    docs_per_scope[scope][collection] = docs_per_task[
                        docs_per_scope[scope][collection]]
            docs_per_scope[CbServer.default_scope][
                CbServer.default_collection] = docs_per_task[docs_per_scope[
                    CbServer.default_scope][CbServer.default_collection]]
            self.log.info("docs_per_scope : {}".format(docs_per_scope))
            # For DGM tests, init_items_per_collection ==
            # max(items loaded in each collection)
            self.init_items_per_collection = max(
                [max(docs_per_scope[scope].values())
                 for scope in docs_per_scope])
            self.log.info("init_items_per_collection =={} ".format(
                self.init_items_per_collection))

    def tearDown(self):
        """Log the final resident ratio of the first bucket, then tear down
        the cluster via the parent class."""
        self.cluster_util.print_cluster_stats(self.cluster)
        dgm = None
        timeout = 60
        while dgm is None and timeout > 0:
            try:
                stats = BucketHelper(
                    self.cluster.master).fetch_bucket_stats(
                        self.buckets[0].name)
                dgm = stats["op"]["samples"][
                    "vb_active_resident_items_ratio"][-1]
                self.log.info(
                    "## Active Resident Threshold of {0} is {1} ##".format(
                        self.buckets[0].name, dgm))
            except Exception:
                # Stat may not be available yet right after the test;
                # retry until timeout
                self.log.debug(
                    "Fetching vb_active_resident_items_ratio(dgm) "
                    "failed...retying")
                timeout -= 1
                time.sleep(1)
        super(StorageBase, self).tearDown()

    def genrate_docs_basic(self, start, end, target_vbucket=None, mutate=0):
        """Return a doc_generator for keys [start, end) using the test's
        doc-size/key-size settings.

        NOTE: name keeps its historical typo ("genrate") because external
        tests call it by this name.
        """
        return doc_generator(self.key, start, end,
                             doc_size=self.doc_size,
                             doc_type=self.doc_type,
                             target_vbucket=target_vbucket,
                             vbuckets=self.cluster.vbuckets,
                             key_size=self.key_size,
                             randomize_doc_size=self.randomize_doc_size,
                             randomize_value=self.randomize_value,
                             mix_key_size=self.mix_key_size,
                             mutate=mutate,
                             deep_copy=self.deep_copy)

    def generate_docs(self, doc_ops=None,
                      target_vbucket=None,
                      create_end=None, create_start=None,
                      create_mutate=0,
                      update_end=None, update_start=None,
                      update_mutate=0,
                      read_end=None, read_start=None,
                      read_mutate=0,
                      delete_end=None, delete_start=None,
                      expiry_end=None, expiry_start=None,
                      expiry_mutate=0):
        """(Re)build self.gen_* doc generators for the requested doc_ops.

        Explicit *_start/*_end arguments win; otherwise previously stored
        ranges are reused, falling back to ranges derived from
        self.start/self.end and the configured op percentages.
        """
        doc_ops = doc_ops or self.doc_ops
        if "update" in doc_ops:
            if update_start is not None:
                self.update_start = update_start
            if update_end is not None:
                self.update_end = update_end
            if self.update_start is None:
                self.update_start = self.start
            if self.update_end is None:
                self.update_end = self.end * self.update_perc / 100
            # Bump the global mutation counter so updated docs get a new
            # mutate value
            self.mutate += 1
            self.gen_update = self.genrate_docs_basic(
                self.update_start, self.update_end,
                target_vbucket=target_vbucket,
                mutate=self.mutate)
        if "delete" in doc_ops:
            if delete_start is not None:
                self.delete_start = delete_start
            if delete_end is not None:
                self.delete_end = delete_end
            if self.delete_start is None:
                self.delete_start = self.start
            if self.delete_end is None:
                self.delete_end = self.end * self.delete_perc / 100
            # NOTE(review): reuses read_mutate (historic behavior);
            # mutate value is irrelevant for deletes
            self.gen_delete = self.genrate_docs_basic(
                self.delete_start, self.delete_end,
                target_vbucket=target_vbucket,
                mutate=read_mutate)
        if "create" in doc_ops:
            if create_start is not None:
                self.create_start = create_start
            if self.create_start is None:
                self.create_start = self.end
            self.start = self.create_start
            if create_end is not None:
                self.create_end = create_end
            if self.create_end is None:
                self.create_end = self.start \
                    + self.num_items * self.create_perc / 100
            self.end = self.create_end
            self.gen_create = self.genrate_docs_basic(
                self.create_start, self.create_end,
                target_vbucket=target_vbucket,
                mutate=create_mutate)
        if "read" in doc_ops:
            if read_start is not None:
                self.read_start = read_start
            if read_end is not None:
                self.read_end = read_end
            if self.read_start is None:
                self.read_start = self.create_start
            if self.read_end is None:
                self.read_end = self.create_end
            self.gen_read = self.genrate_docs_basic(
                self.read_start, self.read_end,
                target_vbucket=target_vbucket,
                mutate=read_mutate)
        if "expiry" in doc_ops:
            if expiry_start is not None:
                self.expiry_start = expiry_start
            elif self.expiry_start is None:
                self.expiry_start = self.start \
                    + (self.num_items * self.delete_perc) / 100
            if expiry_end is not None:
                self.expiry_end = expiry_end
            elif self.expiry_end is None:
                self.expiry_end = self.start + self.num_items \
                    * (self.delete_perc + self.expiry_perc) / 100
            self.gen_expiry = self.genrate_docs_basic(
                self.expiry_start, self.expiry_end,
                target_vbucket=target_vbucket,
                mutate=expiry_mutate)

    def loadgen_docs(self,
                     retry_exceptions=None,
                     ignore_exceptions=None,
                     skip_read_on_error=False,
                     suppress_error_table=False,
                     scope=CbServer.default_scope,
                     collection=CbServer.default_collection,
                     _sync=True,
                     track_failures=True,
                     doc_ops=None,
                     sdk_retry_strategy=None):
        """Kick off async load tasks for every doc_op whose generator is
        set; if _sync, wait for them and validate results.

        Returns the dict of started tasks.
        """
        doc_ops = doc_ops or self.doc_ops
        # BUGFIX: mutable default args ([]) were mutated via append below,
        # so the default list grew across calls. Copy instead.
        retry_exceptions = list(retry_exceptions) if retry_exceptions \
            else []
        ignore_exceptions = ignore_exceptions \
            if ignore_exceptions is not None else []
        tasks_info = dict()
        read_tasks_info = dict()
        read_task = False
        if self.check_temporary_failure_exception:
            retry_exceptions.append(SDKException.TemporaryFailureException)
        if "update" in doc_ops and self.gen_update is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_update, "update", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries,
                time_unit=self.time_unit,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures,
                sdk_client_pool=self.sdk_client_pool,
                sdk_retry_strategy=sdk_retry_strategy)
            tasks_info.update(tem_tasks_info.items())
        if "create" in doc_ops and self.gen_create is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_create, "create", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries,
                time_unit=self.time_unit,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures,
                sdk_client_pool=self.sdk_client_pool,
                sdk_retry_strategy=sdk_retry_strategy)
            tasks_info.update(tem_tasks_info.items())
            self.num_items += (self.gen_create.end - self.gen_create.start)
        if "expiry" in doc_ops and self.gen_expiry is not None \
                and self.maxttl:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_expiry, "update", self.maxttl,
                self.random_exp,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries,
                time_unit=self.time_unit,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures,
                sdk_client_pool=self.sdk_client_pool,
                sdk_retry_strategy=sdk_retry_strategy)
            tasks_info.update(tem_tasks_info.items())
            self.num_items -= (self.gen_expiry.end - self.gen_expiry.start)
        if "read" in doc_ops and self.gen_read is not None:
            read_tasks_info = self.bucket_util._async_validate_docs(
                self.cluster, self.gen_read, "read", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                timeout_secs=self.sdk_timeout,
                time_unit=self.time_unit,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                scope=scope, collection=collection,
                suppress_error_table=suppress_error_table,
                sdk_client_pool=self.sdk_client_pool,
                sdk_retry_strategy=sdk_retry_strategy)
            read_task = True
        if "delete" in doc_ops and self.gen_delete is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_delete, "delete", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries,
                time_unit=self.time_unit,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures,
                sdk_client_pool=self.sdk_client_pool,
                sdk_retry_strategy=sdk_retry_strategy)
            tasks_info.update(tem_tasks_info.items())
            self.num_items -= (self.gen_delete.end - self.gen_delete.start)

        if _sync:
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(
                tasks_info, self.cluster,
                sdk_client_pool=self.sdk_client_pool)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)
        if read_task:
            # TODO: Need to converge read_tasks_info into tasks_info before
            # itself to avoid confusions during _sync=False case
            tasks_info.update(read_tasks_info.items())
            if _sync:
                for task in read_tasks_info:
                    self.task_manager.get_task_result(task)
        return tasks_info

    def get_bucket_dgm(self, bucket):
        """Return the bucket's active resident ratio (%, 100 if the stat
        cannot be fetched within 5 attempts)."""
        self.rest_client = BucketHelper(self.cluster.master)
        count = 0
        dgm = 100
        while count < 5:
            try:
                dgm = self.rest_client.fetch_bucket_stats(
                    bucket.name)["op"]["samples"][
                        "vb_active_resident_items_ratio"][-1]
                self.log.info(
                    "Active Resident Threshold of {0} is {1}".format(
                        bucket.name, dgm))
                return dgm
            except Exception as e:
                self.sleep(5, e)
                count += 1
        return dgm

    def change_swap_space(self, servers=None, disable=True):
        """Disable (swapoff) or enable (swapon) swap on the given servers
        and assert the change took effect."""
        servers = servers or self.cluster.nodes_in_cluster
        if not isinstance(servers, list):
            servers = [servers]
        for server in servers:
            shell = RemoteMachineShellConnection(server)
            if disable:
                _ = shell.execute_command("swapoff -a")
                self.sleep(5)
                output = shell.execute_command(
                    "free | tail -1 | awk '{print $2}'")[0][0].split(
                        '\n')[0]
                self.assertEqual(
                    int(output), 0,
                    msg="Failed to disable swap space on server {} "
                        "having value {}".format(server, output))
            else:
                _ = shell.execute_command("swapon -a")
                self.sleep(5)
                output = shell.execute_command(
                    "free | tail -1 | awk '{print $2}'")[0][0].split(
                        '\n')[0]
                self.assertNotEqual(
                    int(output), 0,
                    msg="Failed to enable swap space on server {} "
                        "having value {}".format(server, output))
            # BUGFIX: shell connections were never closed
            shell.disconnect()
        return

    def check_fragmentation_using_bucket_stats(self, bucket, servers=None):
        """Poll (up to 5 min) until KV fragmentation on every server is
        within 110% of the configured target; return True on success."""
        result = dict()
        if servers is None:
            servers = self.cluster.nodes_in_cluster
        if not isinstance(servers, list):
            servers = [servers]
        time_end = time.time() + 60 * 5
        while time.time() < time_end:
            for server in servers:
                frag_val = self.bucket_util.get_fragmentation_kv(
                    self.cluster, bucket, server)
                self.log.debug("Current Fragmentation for node {} is {}"
                               .format(server.ip, frag_val))
                result.update({server.ip: frag_val})
            if (max(result.values())) <= 1.1 * (self.fragmentation):
                self.log.info(
                    "KV stats fragmentation values {}".format(result))
                return True
        self.log.info("KV stats fragmentation values {}".format(result))
        return False

    def get_fragmentation_upsert_docs_list(self):
        """
        This function gives the list of "number of docs" needed to be
        updated to touch the given fragmentation value
        """
        update_doc_count = int(math.ceil(
            float(self.fragmentation * self.num_items)
            / (100 - self.fragmentation)))
        upsert_doc_list = list()
        while update_doc_count > self.num_items:
            upsert_doc_list.append(self.num_items)
            update_doc_count -= self.num_items
        if update_doc_count > 0:
            upsert_doc_list.append(update_doc_count)
        self.log.info("Upsert list {}".format(upsert_doc_list))
        return upsert_doc_list

    def validate_data(self, op_type, kv_gen, _sync=True):
        """Validate docs in every collection; wait if _sync, else return
        the started validation tasks."""
        self.log.info("Validating Docs")
        validate_tasks_info = dict()
        for collection in self.collections:
            temp_tasks_info = self.bucket_util._async_validate_docs(
                self.cluster, kv_gen, op_type, 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                timeout_secs=self.sdk_timeout,
                scope=CbServer.default_scope,
                collection=collection,
                retry_exceptions=self.retry_exceptions,
                ignore_exceptions=self.ignore_exceptions,
                sdk_client_pool=self.sdk_client_pool)
            validate_tasks_info.update(temp_tasks_info.items())
        if _sync:
            for task in validate_tasks_info:
                self.task_manager.get_task_result(task)
        else:
            return validate_tasks_info

    def sigkill_memcached(self, nodes=None, graceful=False):
        """Kill (or gracefully restart) memcached on the given nodes and
        wait for bucket warm-up to complete."""
        nodes = nodes or self.cluster.nodes_in_cluster
        for node in nodes:
            shell = RemoteMachineShellConnection(node)
            if graceful:
                shell.restart_couchbase()
            else:
                shell.kill_memcached()
            shell.disconnect()
        self.assertTrue(self.bucket_util._wait_warmup_completed(
            [self.cluster.master],
            self.cluster.buckets[0],
            wait_time=self.wait_timeout * 20))

    def get_memory_footprint(self):
        """Print and return this process's RSS in MB (as reported by ps)."""
        out = subprocess.Popen(
            ['ps', 'v', '-p', str(os.getpid())],
            stdout=subprocess.PIPE).communicate()[0].split(b'\n')
        vsz_index = out[0].split().index(b'RSS')
        mem = float(out[1].split()[vsz_index]) / 1024
        print("RAM FootPrint: %s" % str(mem))
        # Return the value too so callers can track memory growth
        return mem

    def crash(self, nodes=None, kill_itr=1, graceful=False,
              wait=True, force_collect=False):
        """Repeatedly kill memcached on kv nodes until self.stop_crash is
        set; abort all tasks on coredumps or failed warm-up."""
        self.stop_crash = False
        self.crash_failure = False
        count = kill_itr
        loop_itr = 0
        msg = None
        nodes = nodes or self.cluster.nodes_in_cluster
        connections = dict()
        for node in nodes:
            shell = RemoteMachineShellConnection(node)
            connections.update({node: shell})
        while not self.stop_crash:
            loop_itr += 1
            sleep = random.randint(30, 60)
            self.sleep(
                sleep,
                "Iteration:{} waiting for {} sec to kill memcached "
                "on all nodes".format(loop_itr, sleep))
            for node, shell in connections.items():
                if "kv" in node.services:
                    if graceful:
                        shell.restart_couchbase()
                    else:
                        while count > 0:
                            shell.kill_memcached()
                            self.sleep(
                                3,
                                "Sleep before killing memcached "
                                "on same node again.")
                            count -= 1
                        count = kill_itr
            result = self.check_coredump_exist(
                self.cluster.nodes_in_cluster,
                force_collect=force_collect)
            if result:
                self.stop_crash = True
                self.task.jython_task_manager.abort_all_tasks()
                self.crash_failure = result
                msg = "CRASH | CRITICAL | WARN messages found in cb_logs"
                self.log.critical(msg)
            if wait:
                for node in nodes:
                    if "kv" in node.services:
                        result = self.bucket_util._wait_warmup_completed(
                            [node],
                            self.cluster.buckets[0],
                            wait_time=self.wait_timeout * 5)
                        if not result:
                            msg = "warm-up couldn't complete in %s seconds" \
                                  % (self.wait_timeout * 5)
                            self.log.critical(msg)
                            self.task.jython_task_manager.abort_all_tasks()
                            self.stop_crash = True
                            self.crash_failure = True
        for _, shell in connections.items():
            shell.disconnect()

    def chmod(self, server, path, mod="000"):
        '''
        # (Base-10)    Binary    Sum (in binary)    Sum (in decimal)    rwx    Permission
        7    111    = 100 + 10 + 1    = 4(r) + 2(w) + 1(x)    rwx    read, write and execute
        6    110    = 100 + 10    = 4(r) + 2(w)    rw-    read and write
        5    101    = 100 + 1    = 4(r) + 1(x)    r-x    read and execute
        4    100    = 100    = 4(r)    r--    read only
        3    011    = 10 + 1    = 2(w) + 1(x)    -wx    write and execute
        2    010    = 10    = 2(w)    -w-    write only
        1    001    = 1    = 1(x)    --x    execute only
        0    000    = 0    = 0    ---    none
        '''
        self.stop_chmod = False
        # Flip the path between `mod` and 777 every 5s until stop_chmod set
        while self.stop_chmod is False:
            shell = RemoteMachineShellConnection(server)
            self.log.debug("{}: changing mod to {} for {}".format(
                server.ip, mod, path))
            shell.execute_command("chmod {} {}".format(mod, path))
            self.sleep(5)
            self.log.debug("{}: changing mod to {} for {}".format(
                server.ip, "777", path))
            shell.execute_command("chmod {} {}".format("777", path))
            self.sleep(5)
            shell.disconnect()

    def set_metadata_purge_interval(self, value, buckets=None, node=None):
        """Set the metadata purge_interval on the given buckets via
        diag/eval and restart memcached cluster-wide to apply it."""
        self.log.info(
            "Changing the bucket properties by changing {0} to {1}".format(
                "purge_interval", value))
        if not buckets:
            buckets = self.buckets
        if node is None:
            node = self.cluster.master
        rest = RestConnection(node)
        shell = RemoteMachineShellConnection(node)
        shell.enable_diag_eval_on_non_local_hosts()
        shell.disconnect()
        for bucket in buckets:
            cmd = '{ok, BC} = ns_bucket:get_bucket(' \
                  '"%s"), BC2 = lists:keyreplace(purge_interval, ' \
                  '1, BC, {purge_interval, %f})' \
                  ', ns_bucket:set_bucket_config("%s", BC2).' \
                  % (bucket.name, value, bucket.name)
            rest.diag_eval(cmd)

        # Restart Memcached in all cluster nodes to reflect the settings
        for server in self.cluster_util.get_kv_nodes(self.cluster,
                                                     master=node):
            shell = RemoteMachineShellConnection(server)
            shell.restart_couchbase()
            shell.disconnect()

        # Check bucket-warm_up after Couchbase restart
        retry_count = 10
        buckets_warmed_up = self.bucket_util.is_warmup_complete(
            self.cluster, buckets, retry_count)
        if not buckets_warmed_up:
            self.log.critical("Few bucket(s) not warmed up "
                              "within expected time")

    def fetch_data_path(self):
        """Return the node's data path, translated for cygwin on Windows."""
        data_path = self.rest.get_data_path()
        if "c:/Program Files" in data_path:
            data_path = data_path.replace("c:/Program Files",
                                          "/cygdrive/c/Program\ Files")
        return data_path
class CollectionsRebalance(CollectionBase):
    """Collection-aware rebalance tests: run doc loads while the
    cluster goes through rebalance/failover/recovery operations"""

    def setUp(self):
        super(CollectionsRebalance, self).setUp()
        self.bucket_util._expiry_pager()
        self.load_gen = doc_generator(self.key, 0, self.num_items)
        self.bucket = self.bucket_util.buckets[0]
        self.rest = RestConnection(self.cluster.master)
        # Doc-load template, plus when (before/during/after the
        # rebalance) and how (sync/async) the load is driven
        self.data_load_spec = self.input.param("data_load_spec",
                                               "volume_test_load")
        self.data_load_stage = self.input.param("data_load_stage",
                                                "before")
        self.data_load_type = self.input.param("data_load_type",
                                               "async")
        self.nodes_swap = self.input.param("nodes_swap", 1)
        self.nodes_failover = self.input.param("nodes_failover", 1)
        # Ops for which async-load completion is handled inside
        # rebalance_operation() rather than by the caller
        self.failover_ops = [
            "graceful_failover_rebalance_out",
            "hard_failover_rebalance_out", "graceful_failover_recovery",
            "hard_failover_recovery"
        ]
        # -1 => operate on all nodes at once, else chunk size per
        # intermediate rebalance
        self.step_count = self.input.param("step_count", -1)
        self.recovery_type = self.input.param("recovery_type", "full")
        self.compaction = self.input.param("compaction", False)
        if self.compaction:
            self.disable_auto_compaction()
        self.warmup = self.input.param("warmup", False)
        self.update_replica = self.input.param(
            "update_replica", False)  # for replica + rebalance tests
        self.updated_num_replicas = self.input.param(
            "updated_num_replicas",
            1)  # for replica + rebalance tests, forced hard failover
        self.forced_hard_failover = self.input.param(
            "forced_hard_failover", False)  # for forced hard failover tests
        self.change_ram_quota_cluster = self.input.param(
            "change_ram_quota_cluster", False)  # To change during rebalance
        self.skip_validations = self.input.param("skip_validations", True)
        if self.compaction:
            self.compaction_tasks = list()
        self.dgm_test = self.input.param("dgm_test", False)

    def tearDown(self):
        super(CollectionsRebalance, self).tearDown()

    def disable_auto_compaction(self):
        """Disable auto-compaction on every couchbase bucket so the
        test can trigger compaction explicitly"""
        buckets = self.bucket_util.get_all_buckets()
        for bucket in buckets:
            if bucket.bucketType == "couchbase":
                self.bucket_util.disable_compaction(bucket=str(bucket.name))

    def compact_all_buckets(self):
        """Start manual compaction on all buckets; tasks are collected
        in self.compaction_tasks for later joining"""
        self.sleep(10, "wait for rebalance to start")
        self.log.info("Starting compaction for each bucket")
        for bucket in self.bucket_util.buckets:
            self.compaction_tasks.append(
                self.task.async_compact_bucket(self.cluster.master,
                                               bucket))

    def warmup_node(self, node):
        """Restart couchbase-server on 'node' to force bucket warmup"""
        self.log.info("Warmuping up node...")
        shell = RemoteMachineShellConnection(node)
        shell.stop_couchbase()
        self.sleep(30)
        shell.start_couchbase()
        shell.disconnect()
        self.log.info("Done warming up...")

    def set_ram_quota_cluster(self):
        """Change the cluster memory quota to 2500 MB while a rebalance
        is in flight"""
        self.sleep(45, "Wait for rebalance have some progress")
        self.log.info("Changing cluster RAM size")
        status = self.rest.init_cluster_memoryQuota(
            self.cluster.master.rest_username,
            self.cluster.master.rest_password,
            memoryQuota=2500)
        self.assertTrue(status, "RAM quota wasn't changed")

    def set_retry_exceptions(self, doc_loading_spec):
        """Fill the doc-load spec's retry list with SDK exceptions that
        are expected (and retryable) while nodes are moving"""
        retry_exceptions = []
        # Retries only apply when the load overlaps the rebalance
        if self.data_load_stage == "during" or (
                self.data_load_stage == "before"
                and self.data_load_type == "async"):
            retry_exceptions.append(SDKException.AmbiguousTimeoutException)
            retry_exceptions.append(SDKException.TimeoutException)
            retry_exceptions.append(SDKException.RequestCanceledException)
            if self.durability_level:
                retry_exceptions.append(
                    SDKException.DurabilityAmbiguousException)
                retry_exceptions.append(
                    SDKException.DurabilityImpossibleException)
        doc_loading_spec[MetaCrudParams.RETRY_EXCEPTIONS] = retry_exceptions

    def get_active_resident_threshold(self, bucket_name):
        """Return the latest active resident-items ratio sample (%)
        for the given bucket"""
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket_name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        return dgm

    def load_to_dgm(self, threshold=100):
        """Repeat the 'dgm_load' data load until the first bucket's
        active resident ratio drops below 'threshold' percent"""
        # load data until resident % goes below 100
        bucket_name = self.bucket_util.buckets[0].name
        curr_active = self.get_active_resident_threshold(bucket_name)
        while curr_active >= threshold:
            self.subsequent_data_load(data_load_spec="dgm_load")
            curr_active = self.get_active_resident_threshold(bucket_name)
            self.log.info("curr_active resident {0} %".format(curr_active))
        self.bucket_util._wait_for_stats_all_buckets()
        self.log.info(
            "Initial dgm load done. Resident {0} %".format(curr_active))

    def data_load_after_failover(self):
        """Run a synchronous doc load right after a failover; doc-count
        validation is deferred until recovery/rebalance-out"""
        self.log.info("Starting a sync data load after failover")
        self.subsequent_data_load()  # sync data load
        # Until we recover/rebalance-out, we can't call -
        # self.bucket_util.validate_docs_per_collections_all_buckets()
        self.bucket_util._wait_for_stats_all_buckets()

    def wait_for_failover_or_assert(self, expected_failover_count,
                                    timeout=180):
        """Poll (every 20s, up to 'timeout' seconds) until exactly
        'expected_failover_count' nodes report inactiveFailed; assert
        otherwise"""
        time_start = time.time()
        time_max_end = time_start + timeout
        actual_failover_count = 0
        while time.time() < time_max_end:
            actual_failover_count = self.get_failover_count()
            if actual_failover_count == expected_failover_count:
                break
            time.sleep(20)
        time_end = time.time()
        if actual_failover_count != expected_failover_count:
            # Dump cluster UI logs to help debug the missing failover
            self.log.info(self.rest.print_UI_logs())
        self.assertTrue(
            actual_failover_count == expected_failover_count,
            "{0} nodes failed over, expected : {1}".format(
                actual_failover_count, expected_failover_count))
        self.log.info(
            "{0} nodes failed over as expected in {1} seconds".format(
                actual_failover_count, time_end - time_start))

    def get_failover_count(self):
        """Return the number of cluster nodes currently in the
        inactiveFailed membership state"""
        rest = RestConnection(self.cluster.master)
        cluster_status = rest.cluster_status()
        failover_count = 0
        # check for inactiveFailed
        for node in cluster_status['nodes']:
            if node['clusterMembership'] == "inactiveFailed":
                failover_count += 1
        return failover_count

    def forced_failover_operation(self, known_nodes=None,
                                  failover_nodes=None,
                                  wait_for_pending=120):
        """Raise the replica count on all buckets, hard-failover each
        node in 'failover_nodes' one at a time, run a sync load and
        return the async rebalance-out task"""
        self.log.info("Updating all the bucket replicas to {0}".format(
            self.updated_num_replicas))
        self.bucket_util.update_all_bucket_replicas(self.updated_num_replicas)
        failover_count = 0
        for failover_node in failover_nodes:
            failover_operation = self.task.failover(
                known_nodes, failover_nodes=[failover_node],
                graceful=False, wait_for_pending=wait_for_pending)
            failover_count = failover_count + 1
            self.wait_for_failover_or_assert(failover_count)
        operation = \
            self.task.async_rebalance(known_nodes, [], failover_nodes)
        self.data_load_after_failover()
        return operation

    def rebalance_operation(self, rebalance_operation, known_nodes=None,
                            add_nodes=None, remove_nodes=None,
                            failover_nodes=None, wait_for_pending=120,
                            tasks=None):
        """Start the requested cluster operation and return the async
        rebalance task for the caller to wait on.

        rebalance_operation: one of rebalance_out / rebalance_in /
            swap_rebalance / rebalance_in_out /
            graceful_failover_rebalance_out /
            hard_failover_rebalance_out / graceful_failover_recovery /
            hard_failover_recovery
        known_nodes: nodes currently in the cluster
        add_nodes / remove_nodes / failover_nodes: node sets used by
            the respective operation
        wait_for_pending: passed through to task.failover
        tasks: async doc-load task(s); for failover ops they are joined
            before data_load_after_failover()

        self.step_count == -1 performs the operation in one shot;
        otherwise the nodes are processed in chunks of step_count with
        one intermediate rebalance per chunk (the last chunk's
        rebalance is returned un-waited).  With self.warmup set, one
        node is restarted first so the initial rebalance fails, and a
        second attempt is made after the bucket warms up.
        """
        self.log.info("Starting rebalance operation of type : {0}".format(
            rebalance_operation))
        step_count = self.step_count
        if rebalance_operation == "rebalance_out":
            if step_count == -1:
                if self.warmup:
                    # Restart the last node, expect the 1st rebalance to
                    # fail, then retry once warmup completes
                    node = known_nodes[-1]
                    self.warmup_node(node)
                    operation = self.task.async_rebalance(
                        known_nodes, [], remove_nodes)
                    self.task.jython_task_manager.get_task_result(operation)
                    if not operation.result:
                        self.log.info("rebalance was failed as expected")
                        for bucket in self.bucket_util.buckets:
                            self.assertTrue(
                                self.bucket_util._wait_warmup_completed(
                                    [node], bucket))
                        self.log.info("second attempt to rebalance")
                        self.sleep(
                            60, "wait before starting rebalance after warmup")
                        operation = self.task.async_rebalance(
                            known_nodes, [], remove_nodes)
                        self.wait_for_rebalance_to_complete(operation)
                    self.sleep(60)
                else:
                    if self.update_replica:
                        self.log.info(
                            "Updating all the bucket replicas to {0}".format(
                                self.updated_num_replicas))
                        self.bucket_util.update_all_bucket_replicas(
                            self.updated_num_replicas)
                        self.bucket_util.print_bucket_stats()
                    # all at once
                    operation = self.task.async_rebalance(
                        known_nodes, [], remove_nodes)
                    if self.compaction:
                        self.compact_all_buckets()
                    if self.change_ram_quota_cluster:
                        self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                remove_list = []
                for i in range(0, len(remove_nodes), step_count):
                    if i + step_count >= len(remove_nodes):
                        remove_list.append(remove_nodes[i:])
                    else:
                        remove_list.append(remove_nodes[i:i + step_count])
                iter_count = 0
                # start each intermediate rebalance and wait for it to finish before
                # starting new one
                for new_remove_nodes in remove_list:
                    operation = self.task.async_rebalance(
                        known_nodes, [], new_remove_nodes)
                    known_nodes = [
                        node for node in known_nodes
                        if node not in new_remove_nodes
                    ]
                    iter_count = iter_count + 1
                    # if this is last intermediate rebalance, don't wait
                    if iter_count == len(remove_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "rebalance_in":
            if step_count == -1:
                if self.warmup:
                    node = known_nodes[-1]
                    self.warmup_node(node)
                    operation = self.task.async_rebalance(
                        known_nodes, add_nodes, [])
                    self.task.jython_task_manager.get_task_result(operation)
                    if not operation.result:
                        self.log.info("rebalance was failed as expected")
                        for bucket in self.bucket_util.buckets:
                            self.assertTrue(
                                self.bucket_util._wait_warmup_completed(
                                    [node], bucket))
                        self.log.info("second attempt to rebalance")
                        self.sleep(
                            60, "wait before starting rebalance after warmup")
                        operation = self.task.async_rebalance(
                            known_nodes + add_nodes, [], [])
                        self.wait_for_rebalance_to_complete(operation)
                    self.sleep(60)
                else:
                    if self.update_replica:
                        self.log.info(
                            "Updating all the bucket replicas to {0}".format(
                                self.updated_num_replicas))
                        self.bucket_util.update_all_bucket_replicas(
                            self.updated_num_replicas)
                        self.bucket_util.print_bucket_stats()
                    # all at once
                    operation = self.task.async_rebalance(
                        known_nodes, add_nodes, [])
                    if self.compaction:
                        self.compact_all_buckets()
                    if self.change_ram_quota_cluster:
                        self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                add_list = []
                for i in range(0, len(add_nodes), step_count):
                    if i + step_count >= len(add_nodes):
                        add_list.append(add_nodes[i:])
                    else:
                        add_list.append(add_nodes[i:i + step_count])
                iter_count = 0
                # start each intermediate rebalance and wait for it to finish before
                # starting new one
                for new_add_nodes in add_list:
                    operation = self.task.async_rebalance(
                        known_nodes, new_add_nodes, [])
                    # NOTE(review): append() inserts the whole sublist as
                    # a single element; the swap_rebalance branch uses
                    # extend() here - possibly intended. Behaviour kept
                    # as-is; confirm before relying on known_nodes later.
                    known_nodes.append(new_add_nodes)
                    iter_count = iter_count + 1
                    # if this is last intermediate rebalance, don't wait
                    if iter_count == len(add_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "swap_rebalance":
            if (step_count == -1):
                if self.warmup:
                    # New nodes must be added up-front so the swap can
                    # proceed once the warmed-up node is back
                    for node in add_nodes:
                        self.rest.add_node(
                            self.cluster.master.rest_username,
                            self.cluster.master.rest_password, node.ip,
                            self.cluster.servers[self.nodes_init].port)
                    node = known_nodes[-1]
                    self.warmup_node(node)
                    operation = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [],
                        remove_nodes,
                        check_vbucket_shuffling=False)
                    self.task.jython_task_manager.get_task_result(operation)
                    if not operation.result:
                        self.log.info("rebalance was failed as expected")
                        for bucket in self.bucket_util.buckets:
                            self.assertTrue(
                                self.bucket_util._wait_warmup_completed(
                                    [node], bucket))
                        self.log.info("second attempt to rebalance")
                        self.sleep(
                            60, "wait before starting rebalance after warmup")
                        operation = self.task.async_rebalance(
                            self.cluster.servers[:self.nodes_init], [],
                            remove_nodes)
                        self.wait_for_rebalance_to_complete(operation)
                    self.sleep(60)
                else:
                    if self.update_replica:
                        self.log.info(
                            "Updating all the bucket replicas to {0}".format(
                                self.updated_num_replicas))
                        self.bucket_util.update_all_bucket_replicas(
                            self.updated_num_replicas)
                        self.bucket_util.print_bucket_stats()
                    for node in add_nodes:
                        self.rest.add_node(
                            self.cluster.master.rest_username,
                            self.cluster.master.rest_password, node.ip,
                            self.cluster.servers[self.nodes_init].port)
                    operation = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [],
                        remove_nodes,
                        check_vbucket_shuffling=False)
                    if self.compaction:
                        self.compact_all_buckets()
                    if self.change_ram_quota_cluster:
                        self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                add_list = []
                remove_list = []
                for i in range(0, len(add_nodes), step_count):
                    if i + step_count >= len(add_nodes):
                        add_list.append(add_nodes[i:])
                        remove_list.append(remove_nodes[i:])
                    else:
                        add_list.append(add_nodes[i:i + step_count])
                        remove_list.append(remove_nodes[i:i + step_count])
                iter_count = 0
                # start each intermediate rebalance and wait for it to finish before
                # starting new one
                for new_add_nodes, new_remove_nodes in zip(
                        add_list, remove_list):
                    operation = self.task.async_rebalance(
                        known_nodes, new_add_nodes, new_remove_nodes,
                        check_vbucket_shuffling=False)
                    known_nodes = [
                        node for node in known_nodes
                        if node not in new_remove_nodes
                    ]
                    known_nodes.extend(new_add_nodes)
                    iter_count = iter_count + 1
                    # if this is last intermediate rebalance, don't wait
                    if iter_count == len(add_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "rebalance_in_out":
            # No step_count chunking for in+out; always one shot
            if self.warmup:
                for node in add_nodes:
                    self.rest.add_node(
                        self.cluster.master.rest_username,
                        self.cluster.master.rest_password, node.ip,
                        self.cluster.servers[self.nodes_init].port)
                node = known_nodes[-1]
                self.warmup_node(node)
                operation = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], remove_nodes)
                self.task.jython_task_manager.get_task_result(operation)
                if not operation.result:
                    self.log.info("rebalance was failed as expected")
                    for bucket in self.bucket_util.buckets:
                        self.assertTrue(
                            self.bucket_util._wait_warmup_completed([node],
                                                                    bucket))
                    self.log.info("second attempt to rebalance")
                    self.sleep(60, "wait before starting rebalance after warmup")
                    operation = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [],
                        remove_nodes)
                    self.wait_for_rebalance_to_complete(operation)
                self.sleep(60)
            else:
                if self.update_replica:
                    self.log.info(
                        "Updating all the bucket replicas to {0}".format(
                            self.updated_num_replicas))
                    self.bucket_util.update_all_bucket_replicas(
                        self.updated_num_replicas)
                    self.bucket_util.print_bucket_stats()
                for node in add_nodes:
                    self.rest.add_node(
                        self.cluster.master.rest_username,
                        self.cluster.master.rest_password, node.ip,
                        self.cluster.servers[self.nodes_init].port)
                operation = self.task.async_rebalance(
                    self.cluster.servers[:self.nodes_init], [], remove_nodes)
                if self.compaction:
                    self.compact_all_buckets()
                if self.change_ram_quota_cluster:
                    self.set_ram_quota_cluster()
        elif rebalance_operation == "graceful_failover_rebalance_out":
            if step_count == -1:
                failover_count = 0
                for failover_node in failover_nodes:
                    failover_operation = self.task.failover(
                        known_nodes, failover_nodes=[failover_node],
                        graceful=True, wait_for_pending=wait_for_pending)
                    failover_count = failover_count + 1
                    self.wait_for_failover_or_assert(failover_count)
                # Join any async load before touching the data again
                if tasks is not None:
                    self.wait_for_async_data_load_to_complete(tasks)
                if self.compaction:
                    self.compact_all_buckets()
                self.data_load_after_failover()
                operation = self.task.async_rebalance(known_nodes, [],
                                                      failover_nodes)
                if self.change_ram_quota_cluster:
                    self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                failover_list = []
                for i in range(0, len(failover_nodes), step_count):
                    if i + step_count >= len(failover_nodes):
                        failover_list.append(failover_nodes[i:])
                    else:
                        failover_list.append(failover_nodes[i:i + step_count])
                # For each set of step_count number of failover nodes we failover and rebalance them out
                iter_count = 0
                for new_failover_nodes in failover_list:
                    failover_count = 0
                    for failover_node in new_failover_nodes:
                        failover_operation = self.task.failover(
                            known_nodes, failover_nodes=[failover_node],
                            graceful=True, wait_for_pending=wait_for_pending)
                        failover_count = failover_count + 1
                        self.wait_for_failover_or_assert(failover_count)
                    if tasks is not None:
                        self.wait_for_async_data_load_to_complete(tasks)
                        tasks = None
                    self.data_load_after_failover()
                    operation = self.task.async_rebalance(
                        known_nodes, [], new_failover_nodes)
                    iter_count = iter_count + 1
                    known_nodes = [
                        node for node in known_nodes
                        if node not in new_failover_nodes
                    ]
                    if iter_count == len(failover_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "hard_failover_rebalance_out":
            if step_count == -1:
                failover_count = 0
                for failover_node in failover_nodes:
                    failover_operation = self.task.failover(
                        known_nodes, failover_nodes=[failover_node],
                        graceful=False, wait_for_pending=wait_for_pending)
                    failover_count = failover_count + 1
                    self.wait_for_failover_or_assert(failover_count)
                if tasks is not None:
                    self.wait_for_async_data_load_to_complete(tasks)
                if self.compaction:
                    self.compact_all_buckets()
                self.data_load_after_failover()
                operation = self.task.async_rebalance(known_nodes, [],
                                                      failover_nodes)
                if self.change_ram_quota_cluster:
                    self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                failover_list = []
                for i in range(0, len(failover_nodes), step_count):
                    if i + step_count >= len(failover_nodes):
                        failover_list.append(failover_nodes[i:])
                    else:
                        failover_list.append(failover_nodes[i:i + step_count])
                # For each set of step_count number of failover nodes we failover and rebalance them out
                iter_count = 0
                for new_failover_nodes in failover_list:
                    failover_count = 0
                    for failover_node in new_failover_nodes:
                        failover_operation = self.task.failover(
                            known_nodes, failover_nodes=[failover_node],
                            graceful=False, wait_for_pending=wait_for_pending)
                        failover_count = failover_count + 1
                        self.wait_for_failover_or_assert(failover_count)
                    if tasks is not None:
                        self.wait_for_async_data_load_to_complete(tasks)
                        tasks = None
                    self.data_load_after_failover()
                    operation = self.task.async_rebalance(
                        known_nodes, [], new_failover_nodes)
                    iter_count = iter_count + 1
                    known_nodes = [
                        node for node in known_nodes
                        if node not in new_failover_nodes
                    ]
                    if iter_count == len(failover_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "graceful_failover_recovery":
            if (step_count == -1):
                failover_count = 0
                for failover_node in failover_nodes:
                    failover_operation = self.task.failover(
                        known_nodes, failover_nodes=[failover_node],
                        graceful=True, wait_for_pending=wait_for_pending)
                    failover_count = failover_count + 1
                    self.wait_for_failover_or_assert(failover_count)
                if tasks is not None:
                    self.wait_for_async_data_load_to_complete(tasks)
                self.data_load_after_failover()
                # Mark the failover nodes for recovery
                for failover_node in failover_nodes:
                    self.rest.set_recovery_type(
                        otpNode='ns_1@' + failover_node.ip,
                        recoveryType=self.recovery_type)
                if self.compaction:
                    self.compact_all_buckets()
                # Rebalance all the nodes
                operation = self.task.async_rebalance(known_nodes, [], [])
                if self.change_ram_quota_cluster:
                    self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                failover_list = []
                for i in range(0, len(failover_nodes), step_count):
                    if i + step_count >= len(failover_nodes):
                        failover_list.append(failover_nodes[i:])
                    else:
                        failover_list.append(failover_nodes[i:i + step_count])
                # For each set of step_count number of failover nodes we failover and recover
                iter_count = 0
                for new_failover_nodes in failover_list:
                    failover_count = 0
                    for failover_node in new_failover_nodes:
                        failover_operation = self.task.failover(
                            known_nodes, failover_nodes=[failover_node],
                            graceful=True, wait_for_pending=wait_for_pending)
                        failover_count = failover_count + 1
                        self.wait_for_failover_or_assert(failover_count)
                    if tasks is not None:
                        self.wait_for_async_data_load_to_complete(tasks)
                        tasks = None
                    self.data_load_after_failover()
                    # Mark the failover nodes for recovery
                    for failover_node in new_failover_nodes:
                        self.rest.set_recovery_type(
                            otpNode='ns_1@' + failover_node.ip,
                            recoveryType=self.recovery_type)
                    operation = self.task.async_rebalance(known_nodes, [], [])
                    iter_count = iter_count + 1
                    if iter_count == len(failover_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        elif rebalance_operation == "hard_failover_recovery":
            if (step_count == -1):
                failover_count = 0
                for failover_node in failover_nodes:
                    failover_operation = self.task.failover(
                        known_nodes, failover_nodes=[failover_node],
                        graceful=False, wait_for_pending=wait_for_pending)
                    failover_count = failover_count + 1
                    self.wait_for_failover_or_assert(failover_count)
                if tasks is not None:
                    self.wait_for_async_data_load_to_complete(tasks)
                self.data_load_after_failover()
                # Mark the failover nodes for recovery
                for failover_node in failover_nodes:
                    self.rest.set_recovery_type(
                        otpNode='ns_1@' + failover_node.ip,
                        recoveryType=self.recovery_type)
                if self.compaction:
                    self.compact_all_buckets()
                # Rebalance all the nodes
                operation = self.task.async_rebalance(known_nodes, [], [])
                if self.change_ram_quota_cluster:
                    self.set_ram_quota_cluster()
            else:
                # list of lists each of length step_count
                failover_list = []
                for i in range(0, len(failover_nodes), step_count):
                    if i + step_count >= len(failover_nodes):
                        failover_list.append(failover_nodes[i:])
                    else:
                        failover_list.append(failover_nodes[i:i + step_count])
                # For each set of step_count number of failover nodes we failover and recover
                iter_count = 0
                for new_failover_nodes in failover_list:
                    failover_count = 0
                    for failover_node in new_failover_nodes:
                        failover_operation = self.task.failover(
                            known_nodes, failover_nodes=[failover_node],
                            graceful=False, wait_for_pending=wait_for_pending)
                        failover_count = failover_count + 1
                        self.wait_for_failover_or_assert(failover_count)
                    if tasks is not None:
                        self.wait_for_async_data_load_to_complete(tasks)
                        tasks = None
                    self.data_load_after_failover()
                    # Mark the failover nodes for recovery
                    for failover_node in new_failover_nodes:
                        self.rest.set_recovery_type(
                            otpNode='ns_1@' + failover_node.ip,
                            recoveryType=self.recovery_type)
                    operation = self.task.async_rebalance(known_nodes, [], [])
                    iter_count = iter_count + 1
                    if iter_count == len(failover_list):
                        continue
                    self.wait_for_rebalance_to_complete(operation)
        else:
            self.fail("rebalance_operation is not defined")
        return operation

    def subsequent_data_load(self, async_load=False, data_load_spec=None):
        """Run a doc-load scenario from 'data_load_spec' (defaults to
        the test's data_load_spec param) and return its task(s)"""
        if data_load_spec is None:
            data_load_spec = self.data_load_spec
        doc_loading_spec = self.bucket_util.get_crud_template_from_package(
            data_load_spec)
        self.over_ride_doc_loading_template_params(doc_loading_spec)
        self.set_retry_exceptions(doc_loading_spec)
        if self.dgm_test:
            if data_load_spec == "dgm_load":
                # pre-load to dgm
                doc_loading_spec[MetaCrudParams.DocCrud.
                                 CREATE_PERCENTAGE_PER_COLLECTION] = 2
            else:
                # Do only deletes during dgm + rebalance op
                doc_loading_spec[MetaCrudParams.DocCrud.
                                 CREATE_PERCENTAGE_PER_COLLECTION] = 0
        if self.forced_hard_failover and self.spec_name == "multi_bucket.buckets_for_rebalance_tests_more_collections":
            # create collections, else if other bucket_spec - then just "create" ops
            doc_loading_spec[MetaCrudParams.COLLECTIONS_TO_ADD_PER_BUCKET] = 20
        tasks = self.bucket_util.run_scenario_from_spec(
            self.task,
            self.cluster,
            self.bucket_util.buckets,
            doc_loading_spec,
            mutation_num=0,
            async_load=async_load,
            batch_size=self.batch_size,
            validate_task=(not self.skip_validations))
        return tasks

    def async_data_load(self):
        """Kick off the configured doc load asynchronously and return
        its task(s)"""
        tasks = self.subsequent_data_load(async_load=True)
        return tasks

    def sync_data_load(self):
        """Run the configured doc load synchronously"""
        self.subsequent_data_load()

    def wait_for_async_data_load_to_complete(self, task):
        """Join the async doc-load 'task' and, unless validations are
        skipped, fail the test if the load failed"""
        self.task.jython_task_manager.get_task_result(task)
        if not self.skip_validations:
            self.bucket_util.validate_doc_loading_results(task)
            if task.result is False:
                self.fail("Doc_loading failed")

    def wait_for_compaction_to_complete(self):
        # Join every compaction task started by compact_all_buckets()
        for task in self.compaction_tasks:
            self.task_manager.get_task_result(task)
            self.assertTrue(task.result,
                            "Compaction failed for bucket: %s"
                            % task.bucket.name)

    def wait_for_rebalance_to_complete(self, task, wait_step=120):
        """Join the rebalance 'task'. For dgm tests a failed rebalance
        is tolerated when some bucket dropped below 20% resident"""
        self.task.jython_task_manager.get_task_result(task)
        if self.dgm_test and (not task.result):
            fail_flag = True
            for bucket in self.bucket_util.buckets:
                result = self.get_active_resident_threshold(bucket.name)
                if result < 20:
                    fail_flag = False
                    self.log.error("DGM less than 20")
                    break
            self.assertFalse(fail_flag, "rebalance failed")
        else:
            self.assertTrue(task.result, "Rebalance Failed")
        if self.compaction:
            self.wait_for_compaction_to_complete()

    def data_validation_collection(self):
        """Post-operation validation: for ttl loads assert every doc
        expired; otherwise validate per-collection doc counts"""
        if not self.skip_validations:
            if self.data_load_spec == "ttl_load" \
                    or self.data_load_spec == "ttl_load1":
                self.bucket_util._expiry_pager()
                self.sleep(400, "wait for maxttl to finish")
                items = 0
                self.bucket_util._wait_for_stats_all_buckets()
                # After maxttl every doc should have expired
                for bucket in self.bucket_util.buckets:
                    items = items + self.bucket_helper_obj.get_active_key_count(
                        bucket)
                if items != 0:
                    self.fail("TTL + rebalance failed")
            elif self.forced_hard_failover:
                # Doc counts cannot be validated after a forced hard
                # failover; skip
                pass
            else:
                self.bucket_util._wait_for_stats_all_buckets()
                self.bucket_util.validate_docs_per_collections_all_buckets()

    def load_collections_with_rebalance(self, rebalance_operation):
        """Main driver: run the doc load before/during/after the given
        rebalance/failover operation and validate the outcome"""
        tasks = None
        rebalance = None
        self.log.info("Doing collection data load {0} {1}".format(
            self.data_load_stage, rebalance_operation))
        if self.data_load_stage == "before":
            if self.data_load_type == "async":
                tasks = self.async_data_load()
            else:
                self.sync_data_load()
        if self.dgm_test:
            self.load_to_dgm()
        # Map the operation name onto the node sets it needs
        if rebalance_operation == "rebalance_in":
            rebalance = self.rebalance_operation(
                rebalance_operation="rebalance_in",
                known_nodes=self.cluster.servers[:self.nodes_init],
                add_nodes=self.cluster.
                servers[self.nodes_init:self.nodes_init + self.nodes_in],
                tasks=tasks)
        elif rebalance_operation == "rebalance_out":
            rebalance = self.rebalance_operation(
                rebalance_operation="rebalance_out",
                known_nodes=self.cluster.servers[:self.nodes_init],
                remove_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_out:],
                tasks=tasks)
        elif rebalance_operation == "swap_rebalance":
            rebalance = self.rebalance_operation(
                rebalance_operation="swap_rebalance",
                known_nodes=self.cluster.servers[:self.nodes_init],
                add_nodes=self.cluster.
                servers[self.nodes_init:self.nodes_init + self.nodes_swap],
                remove_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_swap:],
                tasks=tasks)
        elif rebalance_operation == "rebalance_in_out":
            rebalance = self.rebalance_operation(
                rebalance_operation="rebalance_in_out",
                known_nodes=self.cluster.servers[:self.nodes_init],
                add_nodes=self.cluster.
                servers[self.nodes_init:self.nodes_init + self.nodes_in],
                remove_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_out:],
                tasks=tasks)
        elif rebalance_operation == "graceful_failover_rebalance_out":
            rebalance = self.rebalance_operation(
                rebalance_operation="graceful_failover_rebalance_out",
                known_nodes=self.cluster.servers[:self.nodes_init],
                failover_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_failover:],
                tasks=tasks)
        elif rebalance_operation == "hard_failover_rebalance_out":
            rebalance = self.rebalance_operation(
                rebalance_operation="hard_failover_rebalance_out",
                known_nodes=self.cluster.servers[:self.nodes_init],
                failover_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_failover:],
                tasks=tasks)
        elif rebalance_operation == "graceful_failover_recovery":
            rebalance = self.rebalance_operation(
                rebalance_operation="graceful_failover_recovery",
                known_nodes=self.cluster.servers[:self.nodes_init],
                failover_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_failover:],
                tasks=tasks)
        elif rebalance_operation == "hard_failover_recovery":
            rebalance = self.rebalance_operation(
                rebalance_operation="hard_failover_recovery",
                known_nodes=self.cluster.servers[:self.nodes_init],
                failover_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_failover:],
                tasks=tasks)
        elif rebalance_operation == "forced_hard_failover_rebalance_out":
            rebalance = self.forced_failover_operation(
                known_nodes=self.cluster.servers[:self.nodes_init],
                failover_nodes=self.cluster.servers[:self.nodes_init]
                [-self.nodes_failover:])
        if self.data_load_stage == "during":
            # MB-40654
            self.sleep(10, "wait for rebalance to start")
            if self.data_load_type == "async":
                tasks = self.async_data_load()
            else:
                self.sync_data_load()
        if not self.warmup:
            # warmup flows already waited inside rebalance_operation()
            self.wait_for_rebalance_to_complete(rebalance)
        if self.data_load_stage == "during" \
                or self.data_load_stage == "before":
            if self.data_load_type == "async":
                # for failover + before + async,
                # wait_for_async_data_load_to_complete is already done
                if self.data_load_stage == "before" \
                        and rebalance_operation in self.failover_ops:
                    pass
                else:
                    self.wait_for_async_data_load_to_complete(tasks)
            self.data_validation_collection()
        if self.data_load_stage == "after":
            self.sync_data_load()
            self.data_validation_collection()

    # Thin entry points: one test per supported operation name
    def test_data_load_collections_with_rebalance_in(self):
        self.load_collections_with_rebalance(
            rebalance_operation="rebalance_in")

    def test_data_load_collections_with_rebalance_out(self):
        self.load_collections_with_rebalance(
            rebalance_operation="rebalance_out")

    def test_data_load_collections_with_swap_rebalance(self):
        self.load_collections_with_rebalance(
            rebalance_operation="swap_rebalance")

    def test_data_load_collections_with_rebalance_in_out(self):
        self.load_collections_with_rebalance(
            rebalance_operation="rebalance_in_out")

    def test_data_load_collections_with_graceful_failover_rebalance_out(self):
        self.load_collections_with_rebalance(
            rebalance_operation="graceful_failover_rebalance_out")

    def test_data_load_collections_with_hard_failover_rebalance_out(self):
        self.load_collections_with_rebalance(
            rebalance_operation="hard_failover_rebalance_out")

    def test_data_load_collections_with_graceful_failover_recovery(self):
        self.load_collections_with_rebalance(
            rebalance_operation="graceful_failover_recovery")

    def test_data_load_collections_with_hard_failover_recovery(self):
        self.load_collections_with_rebalance(
            rebalance_operation="hard_failover_recovery")

    def test_data_load_collections_with_forced_hard_failover_rebalance_out(
            self):
        self.load_collections_with_rebalance(
            rebalance_operation="forced_hard_failover_rebalance_out")
class volume(BaseTestCase):
    """Volume/longevity test suite.

    Drives large collection-aware CRUD workloads against a cluster while
    performing rebalance, swap, failover and recovery operations, then
    validates doc counts and vbucket distribution after each step.
    CRUD key ranges are tracked via the start/end counters set up below
    and advanced by generate_docs().
    """

    def setUp(self):
        """Read test-input params and initialise CRUD bookkeeping counters."""
        self.input = TestInputSingleton.input
        # This suite creates its own buckets, so suppress the default one
        self.input.test_params.update({"default_bucket": False})
        BaseTestCase.setUp(self)
        self.rest = RestConnection(self.servers[0])
        self.op_type = self.input.param("op_type", "create")
        self.available_servers = list()
        # Servers beyond the initial cluster act as rebalance spares
        self.available_servers = self.cluster.servers[self.nodes_init:]
        self.num_buckets = self.input.param("num_buckets", 1)
        # Mutation counter passed to the update doc generator
        self.mutate = 0
        self.doc_ops = self.input.param("doc_ops", None)
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')
        self.iterations = self.input.param("iterations", 2)
        self.vbucket_check = self.input.param("vbucket_check", True)
        self.new_num_writer_threads = self.input.param(
            "new_num_writer_threads", 6)
        self.new_num_reader_threads = self.input.param(
            "new_num_reader_threads", 8)
        # Percentage of self.num_items targeted by each CRUD op
        self.create_perc = 100
        self.update_perc = self.input.param("update_perc", 50)
        self.delete_perc = self.input.param("delete_perc", 50)
        self.expiry_perc = self.input.param("expiry_perc", 0)
        # Key-range bookkeeping used by generate_docs()/print_crud_stats()
        self.start = 0
        self.end = 0
        self.initial_items = self.start
        self.final_items = self.end
        self.create_end = 0
        self.create_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.num_collections = self.input.param("num_collections", 10)

    def create_required_buckets(self):
        """Size the KV memory quota from the node's available memory and
        create ``num_buckets`` buckets, then rebalance.

        Returns the last Bucket object created (callers use a single
        bucket; with num_buckets > 1 only the final one is returned).
        """
        self.log.info("Get the available memory quota")
        self.info = self.rest.get_nodes_self()
        # MB kept free so the KV quota does not consume everything
        threshold_memory = 100
        # threshold_memory_vagrant = 100
        total_memory_in_mb = self.info.mcdMemoryReserved
        total_available_memory_in_mb = total_memory_in_mb
        # If the mentioned service is already present,
        # we remove that much memory from available memory quota
        if "index" in self.info.services:
            total_available_memory_in_mb -= self.info.indexMemoryQuota
        if "fts" in self.info.services:
            total_available_memory_in_mb -= self.info.ftsMemoryQuota
        if "cbas" in self.info.services:
            total_available_memory_in_mb -= self.info.cbasMemoryQuota
        if "eventing" in self.info.services:
            total_available_memory_in_mb -= self.info.eventingMemoryQuota
        available_memory = total_available_memory_in_mb - threshold_memory
        self.rest.set_service_memoryQuota(service='memoryQuota',
                                          memoryQuota=available_memory)
        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        self.bucket_expiry = self.input.param("bucket_expiry", 0)
        ramQuota = self.input.param("ramQuota", available_memory)
        buckets = self.input.param("bucket_names",
                                   "GleamBookUsers").split(';')
        # NOTE(review): these split() calls mutate the setUp attributes,
        # so calling this method twice would fail — confirm single-use
        self.bucket_type = self.bucket_type.split(';')
        self.compression_mode = self.compression_mode.split(';')
        # NOTE(review): self-assignment below is a no-op
        self.bucket_eviction_policy = self.bucket_eviction_policy
        for i in range(self.num_buckets):
            bucket = Bucket({
                Bucket.name: buckets[i],
                # Quota is split evenly across the requested buckets
                Bucket.ramQuotaMB: ramQuota / self.num_buckets,
                Bucket.maxTTL: self.bucket_expiry,
                Bucket.replicaNumber: self.num_replicas,
                Bucket.storageBackend: self.bucket_storage,
                Bucket.evictionPolicy: self.bucket_eviction_policy,
                Bucket.bucketType: self.bucket_type[i],
                Bucket.compressionMode: self.compression_mode[i]
            })
            self.bucket_util.create_bucket(bucket)
        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        self.rest.monitorRebalance()
        return bucket

    def set_num_writer_and_reader_threads(self, num_writer_threads="default",
                                          num_reader_threads="default"):
        """Push memcached reader/writer thread settings to every KV node."""
        for node in self.cluster_util.get_kv_nodes():
            bucket_helper = BucketHelper(node)
            bucket_helper.update_memcached_settings(
                num_writer_threads=num_writer_threads,
                num_reader_threads=num_reader_threads)

    def generate_docs(self, doc_ops=None):
        """Build doc generators for the requested ops and advance the
        key-range counters.

        :param doc_ops: iterable/string of ops ("create"/"update"/
            "delete"/"expiry"); defaults to self.doc_ops.
        Side effects: sets self.gen_* generators, the *_start/*_end
        markers, and adjusts self.final_items by the expected delta.
        NOTE(review): the ``* perc / 100`` expressions are integer
        division on Python 2/Jython but float on Python 3 — confirm the
        intended runtime, as doc_generator ranges may become floats.
        """
        self.gen_delete = None
        self.gen_create = None
        self.gen_update = None
        self.gen_expiry = None
        self.create_end = 0
        self.create_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        # Previous iteration's final count becomes this one's baseline
        self.initial_items = self.final_items
        if doc_ops is None:
            doc_ops = self.doc_ops
        if "update" in doc_ops:
            # Updates always target the first update_perc% of the keyspace
            self.update_start = 0
            self.update_end = self.num_items * self.update_perc / 100
            self.mutate += 1
            self.gen_update = doc_generator(
                "Users", self.update_start, self.update_end,
                doc_size=self.doc_size, doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size,
                mutate=self.mutate)
        if "delete" in doc_ops:
            self.delete_start = self.start
            self.delete_end = self.start + \
                (self.num_items * self.delete_perc) / 100
            self.gen_delete = doc_generator(
                "Users", self.delete_start, self.delete_end,
                doc_size=self.doc_size, doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            # Each collection loses this many docs
            self.final_items -= (self.delete_end - self.delete_start) \
                * self.num_collections
        if "expiry" in doc_ops and self.maxttl:
            # Expiry range starts where the delete range ends
            self.expire_start = self.start + \
                (self.num_items * self.delete_perc) / 100
            self.expire_end = self.start + self.num_items * (
                self.delete_perc + self.expiry_perc) / 100
            self.gen_expiry = doc_generator(
                "Users", self.expire_start, self.expire_end,
                doc_size=self.doc_size, doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            self.final_items -= (self.expire_end - self.expire_start) \
                * self.num_collections
        if "create" in doc_ops:
            # Creates extend the keyspace past the current end marker
            self.start = self.end
            self.end += self.num_items * self.create_perc / 100
            self.create_start = self.start
            self.create_end = self.end
            self.gen_create = doc_generator(
                "Users", self.start, self.end,
                doc_size=self.doc_size, doc_type=self.doc_type,
                target_vbucket=self.target_vbucket,
                vbuckets=self.cluster_util.vbuckets,
                key_size=self.key_size,
                randomize_doc_size=self.randomize_doc_size,
                randomize_value=self.randomize_value,
                mix_key_size=self.mix_key_size)
            self.final_items += (self.end - self.start) \
                * self.num_collections

    def doc_loader(self, op_type, kv_gen, exp=0, scope=None, collection=None):
        """Kick off an async load of kv_gen into all buckets and return
        the {task: info} dict from the bucket util."""
        if scope is None:
            scope = CbServer.default_scope
        if collection is None:
            collection = CbServer.default_collection
        # Narrower retry list than the suite-wide one used elsewhere
        retry_exceptions = [
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException
        ]
        tasks_info = self.bucket_util._async_load_all_buckets(
            self.cluster, kv_gen, op_type, exp,
            batch_size=self.batch_size,
            process_concurrency=self.process_concurrency,
            persist_to=self.persist_to, replicate_to=self.replicate_to,
            durability=self.durability_level,
            pause_secs=5, timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            retry_exceptions=retry_exceptions,
            scope=scope, collection=collection)
        return tasks_info

    def data_load(self,
                  scope=CbServer.default_scope,
                  collections=[CbServer.default_scope]):
        """Start async loads for every generator prepared by
        generate_docs(), once per collection; returns merged task infos.

        NOTE(review): mutable default argument, and the default list
        holds ``default_scope`` where a collection name is expected —
        presumably works because default scope/collection share the name
        "_default"; confirm and consider ``default_collection``.
        """
        tasks_info = dict()
        for collection in collections:
            if self.gen_update is not None:
                task_info = self.doc_loader("update", self.gen_update,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_create is not None:
                task_info = self.doc_loader("create", self.gen_create,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_delete is not None:
                task_info = self.doc_loader("delete", self.gen_delete,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
            if self.gen_expiry is not None and self.maxttl:
                # Expiry is modelled as an update with exp=maxttl
                task_info = self.doc_loader("update", self.gen_expiry,
                                            self.maxttl,
                                            scope=scope,
                                            collection=collection)
                tasks_info.update(task_info.items())
        return tasks_info

    def data_validation(self, tasks_info,
                        scope=CbServer.default_scope,
                        collections=[CbServer.default_scope],
                        check_docs=True):
        """Wait for the load tasks, assert none failed, and (optionally)
        validate active docs per generator/collection.

        NOTE(review): same mutable-default/collection-name caveat as
        data_load(). When gen_expiry is set, this sleeps maxttl once per
        bucket*collection before validating the expired range.
        """
        # First drain all load tasks, then run the exception checks
        for task in tasks_info:
            self.task_manager.get_task_result(task)
        self.bucket_util.verify_doc_op_task_exceptions(tasks_info,
                                                       self.cluster)
        self.bucket_util.log_doc_ops_task_failures(tasks_info)
        for task, task_info in tasks_info.items():
            self.assertFalse(
                task_info["ops_failed"],
                "Doc ops failed for task: {}".format(task.thread_name))
        if check_docs:
            self.log.info("Validating Active/Replica Docs")
            # Replica validation disabled; only active vbuckets checked
            self.check_replica = False
            for bucket in self.bucket_util.buckets:
                tasks = list()
                for collection in collections:
                    if self.gen_update is not None:
                        tasks.append(self.task.async_validate_docs(
                            self.cluster, bucket, self.gen_update,
                            "update", 0,
                            batch_size=self.batch_size,
                            process_concurrency=self.process_concurrency,
                            pause_secs=5, timeout_secs=self.sdk_timeout,
                            check_replica=self.check_replica,
                            scope=scope, collection=collection))
                    if self.gen_create is not None:
                        tasks.append(self.task.async_validate_docs(
                            self.cluster, bucket, self.gen_create,
                            "create", 0,
                            batch_size=self.batch_size,
                            process_concurrency=self.process_concurrency,
                            pause_secs=5, timeout_secs=self.sdk_timeout,
                            check_replica=self.check_replica,
                            scope=scope, collection=collection))
                    if self.gen_delete is not None:
                        tasks.append(self.task.async_validate_docs(
                            self.cluster, bucket, self.gen_delete,
                            "delete", 0,
                            batch_size=self.batch_size,
                            process_concurrency=self.process_concurrency,
                            pause_secs=5, timeout_secs=self.sdk_timeout,
                            check_replica=self.check_replica,
                            scope=scope, collection=collection))
                    if self.gen_expiry is not None:
                        # Expired docs validate as deletes after TTL passes
                        self.sleep(
                            self.maxttl,
                            "Wait for docs to expire until expiry time..")
                        tasks.append(self.task.async_validate_docs(
                            self.cluster, bucket, self.gen_expiry,
                            "delete", 0,
                            batch_size=self.batch_size,
                            process_concurrency=self.process_concurrency,
                            pause_secs=5, timeout_secs=self.sdk_timeout,
                            check_replica=self.check_replica,
                            scope=scope, collection=collection))
                for task in tasks:
                    self.task.jython_task_manager.get_task_result(task)
            self.bucket_util._wait_for_stats_all_buckets()
            # self.bucket_util.verify_stats_all_buckets(self.final_items)

    def get_bucket_dgm(self, bucket):
        """Log the bucket's latest active resident-ratio (DGM) sample."""
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"][
            "vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))

    # Stopping and restarting the memcached process
    def stop_process(self):
        """Stop memcached on the third server for 20s, then revert.

        NOTE(review): hard-codes self.servers[2]; assumes the test is
        always run with at least 3 servers — confirm against confs.
        """
        target_node = self.servers[2]
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        error_to_simulate = "stop_memcached"
        # Induce the error condition
        error_sim.create(error_to_simulate)
        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition and close the ssh session
        error_sim.revert(error_to_simulate)
        remote.disconnect()

    def rebalance(self, nodes_in=0, nodes_out=0):
        """Start an async rebalance adding/removing random spare nodes.

        Updates self.available_servers and cluster membership
        bookkeeping immediately (before the task completes) and returns
        the async rebalance task. Swap rebalance (in == out) disables
        the vbucket-shuffling check.
        """
        servs_in = random.sample(self.available_servers, nodes_in)
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        # Master must never be rebalanced out
        self.nodes_cluster.remove(self.cluster.master)
        servs_out = random.sample(self.nodes_cluster, nodes_out)
        if nodes_in == nodes_out:
            self.vbucket_check = False
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], servs_in, servs_out,
            check_vbucket_shuffling=self.vbucket_check,
            retry_get_process_num=150)
        self.available_servers = [
            servs for servs in self.available_servers
            if servs not in servs_in
        ]
        self.available_servers += servs_out
        self.cluster.nodes_in_cluster.extend(servs_in)
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        return rebalance_task

    def print_crud_stats(self):
        """Log a table of this iteration's CRUD key ranges and counts."""
        self.table = TableView(self.log.info)
        self.table.set_headers([
            "Initial Items", "Current Items", "Items Updated",
            "Items Created", "Items Deleted", "Items Expired"
        ])
        self.table.add_row([
            str(self.initial_items), str(self.final_items),
            str(self.update_start) + "-" + str(self.update_end),
            str(self.create_start) + "-" + str(self.create_end),
            str(self.delete_start) + "-" + str(self.delete_end),
            str(self.expire_start) + "-" + str(self.expire_end)
        ])
        self.table.display("Docs statistics")

    def Volume(self):
        """Main volume test: 15 steps of rebalance/failover/recovery
        with parallel data load, repeated self.iterations times."""
        #######################################################################
        self.log.info("Step1: Create a n node cluster")
        if self.nodes_init > 1:
            nodes_init = self.cluster.servers[1:self.nodes_init]
            self.task.rebalance([self.cluster.master], nodes_init, [])
            self.cluster.nodes_in_cluster.extend(
                [self.cluster.master] + nodes_init)
        #######################################################################
        self.log.info("Step 2 & 3: Create required buckets.")
        self.bucket = self.create_required_buckets()
        self.loop = 0
        scope_name = "VolumeScope"
        collection_prefix = "VolumeCollection"
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": scope_name})
        for i in range(self.num_collections):
            collection_name = collection_prefix + str(i)
            self.log.info("Creating scope::collection '%s::%s'"
                          % (scope_name, collection_name))
            self.bucket_util.create_collection(self.cluster.master,
                                               self.bucket,
                                               scope_name,
                                               {"name": collection_name})
            self.sleep(2)
        #######################################################################
        while self.loop < self.iterations:
            self.log.info("Step 4: Pre-Requisites for Loading of docs")
            self.bucket_util.add_rbac_user()
            self.generate_docs(doc_ops="create")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            for task in tasks_info:
                self.task.jython_task_manager.get_task_result(task)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            self.create_perc = self.input.param("create_perc", 100)
            ###################################################################
            self.log.info("Step 5: Rebalance in with Loading of docs")
            self.generate_docs(doc_ops="create")
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 6: Rebalance Out with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 7: Rebalance In_Out with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 8: Swap with Loading of docs")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 9: Updating the bucket replica to 2")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=2)
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 10: Stopping and restarting memcached process")
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            # No-op rebalance (no nodes in/out) run alongside the load
            rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                       [], [])
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.stop_process()
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info(
                "Step 11: Failover a node and RebalanceOut that node \
                with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)
            # Mark Node for failover
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            self.nodes = self.rest.node_statuses()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[self.chosen[0].id])
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg="Rebalance failed")
            servs_out = [
                node for node in self.cluster.servers
                if node.ip == self.chosen[0].ip
            ]
            self.cluster.nodes_in_cluster = list(
                set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.available_servers += servs_out
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            # Add a node back so later steps have capacity
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 12: Failover a node and FullRecovery\
                that node")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            # Mark Node for full recovery
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="full")
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            ###################################################################
            self.log.info("Step 13: Failover a node and DeltaRecovery that \
                node with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            disk_replica_dataset, disk_active_dataset = self.bucket_util.\
                get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(self.chosen[0].id,
                                                           graceful=True)
            self.sleep(10)
            self.rest.monitorRebalance()
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="delta")
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            #######################################################################
            self.log.info("Step 14: Updating the bucket replica to 1")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=1)
            self.generate_docs()
            self.set_num_writer_and_reader_threads(
                num_writer_threads=self.new_num_writer_threads,
                num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(self.cluster.servers,
                                                       [], [])
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.set_num_writer_and_reader_threads(
                num_writer_threads="disk_io_optimized",
                num_reader_threads="disk_io_optimized")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            self.assertTrue(rebalance_task.result, "Rebalance Failed")
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            #######################################################################
            self.log.info("Step 15: Flush the bucket and \
                start the entire process again")
            self.loop += 1
            if self.loop < self.iterations:
                # Flush the bucket
                self.bucket_util.flush_all_buckets(self.cluster.master)
                self.sleep(10)
                # Shrink the cluster back to nodes_init for the next loop
                if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                    nodes_cluster = self.cluster.nodes_in_cluster[:]
                    nodes_cluster.remove(self.cluster.master)
                    servs_out = random.sample(
                        nodes_cluster,
                        int(len(self.cluster.nodes_in_cluster)
                            - self.nodes_init))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [],
                        servs_out)
                    self.task.jython_task_manager.get_task_result(
                        rebalance_task)
                    self.available_servers += servs_out
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(servs_out))
                self.get_bucket_dgm(self.bucket)
            else:
                self.log.info("Volume Test Run Complete")
                self.get_bucket_dgm(self.bucket)

    def SteadyStateVolume(self):
        """Steady-state volume test: repeated create/update cycles plus
        collection drops, without rebalance/failover churn."""
        #######################################################################
        self.log.info("Step 1: Create a n node cluster")
        if self.nodes_init > 1:
            nodes_init = self.cluster.servers[1:self.nodes_init]
            self.task.rebalance([self.cluster.master], nodes_init, [])
            self.cluster.nodes_in_cluster.extend(
                [self.cluster.master] + nodes_init)
        #######################################################################
        self.log.info("Step 2: Create required buckets.")
        self.bucket = self.create_required_buckets()
        self.loop = 0
        scope_name = "VolumeScope"
        collection_prefix = "VolumeCollection"
        self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                      {"name": scope_name})
        for i in range(self.num_collections):
            collection_name = collection_prefix + str(i)
            self.log.info("Creating scope::collection '%s::%s'"
                          % (scope_name, collection_name))
            self.bucket_util.create_collection(self.cluster.master,
                                               self.bucket,
                                               scope_name,
                                               {"name": collection_name})
            self.sleep(2)
        #######################################################################
        self.log.info("Step 3: Per-Requisites for Loading of docs")
        self.create_perc = 100
        # Two rounds of pure creates to build the initial data set
        _iter = 0
        while _iter < 2:
            self.generate_docs(doc_ops="create")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.data_validation(tasks_info, check_docs=False)
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1
        # Ten rounds of full-keyspace updates
        _iter = 0
        self.update_perc = 100
        while _iter < 10:
            self.generate_docs(doc_ops="update")
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1
        # Drop every other collection, keeping local bookkeeping in sync
        for i in range(1, self.num_collections, 2):
            collection_name = collection_prefix + str(i)
            self.bucket_util.drop_collection(self.cluster.master,
                                             self.bucket,
                                             scope_name, collection_name)
            self.bucket.scopes[scope_name].collections.pop(collection_name)
        self.update_perc = self.input.param("update_perc", 100)
        self.create_perc = self.input.param("create_perc", 100)
        # Ten rounds of mixed ops (per self.doc_ops) on surviving collections
        _iter = 0
        while _iter < 10:
            self.generate_docs()
            tasks_info = self.data_load(
                scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.data_validation(
                tasks_info, scope=scope_name,
                collections=self.bucket.scopes[
                    scope_name].collections.keys())
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(self.bucket)
            _iter += 1
def test_MB_40531(self):
    """
    Test to validate,
    1. Active resident ratio on the nodes never goes down below
       the replica_rr value
    2. 'evictable' (vb_replica_itm_mem - vb_replica_meta_data_mem) value
       never goes below wm_threshold of total bucket memory (ep_max_size)

    A background thread polls cbstats on every KV node while the main
    thread loads docs until the active DGM drops to `target_dgm`.
    :return:
    """
    def check_replica_eviction():
        # Background poller. Reads `run_eviction_check` / `node_data` /
        # `bucket` from the enclosing scope; the outer scope flips
        # `run_eviction_check` to False to stop this loop.
        tbl = TableView(self.log.info)
        tbl.set_headers([
            "Node", "Memory", "WM_Threshold", "Itm_mem", "Meta_mem",
            "Evictable_mem", "A_rr", "R_rr"
        ])
        while self.test_failure is None and run_eviction_check:
            tbl.rows = []
            for kv_node in node_data.keys():
                all_stats = \
                    node_data[kv_node]["cbstat"].all_stats(bucket.name)
                bucket_mem = int(all_stats["ep_max_size"])
                # Gap between high and low watermarks, as a percentage
                wm_threshold = \
                    (float(all_stats["ep_mem_high_wat_percent"])
                     - float(all_stats["ep_mem_low_wat_percent"])) * 100
                # Replica memory that could still be evicted
                evictable_mem = \
                    int(all_stats["vb_replica_itm_memory"]) \
                    - int(all_stats["vb_replica_meta_data_memory"])
                active_rr = int(all_stats["vb_active_perc_mem_resident"])
                replica_rr = int(all_stats["vb_replica_perc_mem_resident"])
                tbl.add_row([
                    kv_node.ip, str(bucket_mem), str(wm_threshold),
                    all_stats["vb_replica_itm_memory"],
                    all_stats["vb_replica_meta_data_memory"],
                    str(evictable_mem), str(active_rr), str(replica_rr)
                ])
                # Active items must not be evicted while replica memory
                # is still evictable.
                # NOTE(review): this divides by the watermark gap; a
                # "wm_threshold% of quota" intent would read
                # bucket_mem * wm_threshold / 100 — the two only agree
                # when the gap is exactly 10%. Confirm intended formula.
                if active_rr != 100 \
                        and evictable_mem > (bucket_mem / wm_threshold):
                    tbl.display("Node memory stats")
                    self.log_failure("%s - Active keys evicted before "
                                     "meeting the threshold: %s"
                                     % (kv_node.ip, all_stats))
                # Replicas should always be evicted before actives
                if replica_rr > active_rr:
                    tbl.display("Node memory stats")
                    self.log_failure(
                        "%s: (active_rr) %s < %s (replica_rr)"
                        % (kv_node.ip, active_rr, replica_rr))

    bucket = self.bucket_util.buckets[0]
    # Per-node cbstats handle + vbucket ownership snapshot
    node_data = dict()
    kv_nodes = self.cluster_util.get_kv_nodes()
    for node in kv_nodes:
        cbstat = Cbstats(RemoteMachineShellConnection(node))
        node_data[node] = dict()
        node_data[node]["cbstat"] = cbstat
        node_data[node]["active"] = cbstat.vbucket_list(
            bucket.name, "active")
        node_data[node]["replica"] = cbstat.vbucket_list(
            bucket.name, "replica")
    target_dgm = 30
    run_eviction_check = True
    bucket_helper = BucketHelper(self.cluster.master)
    eviction_check_thread = Thread(target=check_replica_eviction)
    eviction_check_thread.start()

    op_index = 0
    op_batch_size = 8000
    create_batch_size = 10000

    # Perform ADD/SET/READ until targeted DGM value is reached
    curr_dgm = bucket_helper.fetch_bucket_stats(
        bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
    self.log.info("Wait for DGM to reach %s%%. Current DGM: %s%%"
                  % (target_dgm, curr_dgm))
    while int(curr_dgm) > target_dgm and self.test_failure is None:
        # New keys appended past the current item count
        create_gen = doc_generator(self.key, self.num_items,
                                   self.num_items + create_batch_size,
                                   key_size=self.key_size,
                                   doc_size=self.doc_size,
                                   mutation_type="ADD")
        # Re-mutate a sliding window of existing keys
        update_gen = doc_generator(self.key, op_index,
                                   op_index + op_batch_size,
                                   key_size=self.key_size,
                                   doc_size=self.doc_size,
                                   mutation_type="ADD")
        read_gen = doc_generator(self.key, op_index,
                                 op_index + op_batch_size,
                                 key_size=self.key_size,
                                 doc_size=0)
        create_task = self.task.async_load_gen_docs(
            self.cluster, bucket, create_gen, "create", 0,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)
        update_task = self.task.async_load_gen_docs(
            self.cluster, bucket, update_gen, "update", 0,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)
        read_task = self.task.async_load_gen_docs(
            self.cluster, bucket, read_gen, "read",
            timeout_secs=self.sdk_timeout,
            print_ops_rate=False,
            batch_size=200,
            process_concurrency=1)
        self.task_manager.get_task_result(create_task)
        self.task_manager.get_task_result(update_task)
        self.task_manager.get_task_result(read_task)

        # Update indexes for next iteration
        op_index += op_batch_size
        self.num_items += create_batch_size

        curr_dgm = bucket_helper.fetch_bucket_stats(
            bucket.name
        )["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Current DGM: %s%%" % curr_dgm)

    # Stop eviction check thread
    run_eviction_check = False
    eviction_check_thread.join()

    # Close shell connections
    for node in kv_nodes:
        node_data[node]["cbstat"].shellConn.disconnect()
    self.validate_test_failure()
class MagmaBaseTest(BaseTestCase):
    """Base class for magma storage-engine tests.

    setUp builds the cluster (optionally with extra dcp/index services),
    creates buckets/scopes/collections, primes the doc-generator
    bookkeeping (start/end markers per op type) and performs the initial
    data load.  The remaining helpers wrap doc loading, stats/disk-usage
    collection and crash/chaos utilities shared by the magma suites.
    """

    def setUp(self):
        super(MagmaBaseTest, self).setUp()
        self.vbuckets = self.input.param("vbuckets",
                                         self.cluster_util.vbuckets)
        self.rest = RestConnection(self.cluster.master)
        self.bucket_ram_quota = self.input.param("bucket_ram_quota", None)
        self.fragmentation = int(self.input.param("fragmentation", 50))
        self.check_temporary_failure_exception = False
        # SDK exceptions doc-load tasks are allowed to retry on
        self.retry_exceptions = [
            SDKException.TimeoutException,
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException,
            SDKException.UnambiguousTimeoutException]
        self.ignore_exceptions = []

        # Sets autocompaction at bucket level
        self.autoCompactionDefined = str(
            self.input.param("autoCompactionDefined", "false")).lower()

        # Create Cluster
        self.rest.init_cluster(username=self.cluster.master.rest_username,
                               password=self.cluster.master.rest_password)
        nodes_init = self.cluster.servers[1:self.nodes_init]
        self.services = ["kv"] * self.nodes_init
        self.dcp_services = self.input.param("dcp_services", None)
        self.dcp_servers = []
        if self.dcp_services:
            server = self.rest.get_nodes_self()
            self.rest.set_service_memoryQuota(
                service='indexMemoryQuota',
                memoryQuota=int(server.mcdMemoryReserved - 100))
            # Input "index:n1ql-kv" becomes ["index,n1ql", "kv"]
            self.dcp_services = [
                service.replace(":", ",")
                for service in self.dcp_services.split("-")]
            self.services.extend(self.dcp_services)
            self.dcp_servers = self.cluster.servers[
                self.nodes_init:self.nodes_init + len(self.dcp_services)]
        nodes_in = nodes_init + self.dcp_servers
        result = self.task.rebalance([self.cluster.master], nodes_in, [],
                                     services=self.services[1:])
        self.assertTrue(result, "Initial rebalance failed")
        self.cluster.nodes_in_cluster.extend([self.cluster.master] + nodes_in)
        for idx, node in enumerate(self.cluster.nodes_in_cluster):
            node.services = self.services[idx]

        # Create Buckets
        self.bucket_storage = self.input.param("bucket_storage",
                                               Bucket.StorageBackend.magma)
        self.bucket_eviction_policy = self.input.param(
            "bucket_eviction_policy", Bucket.EvictionPolicy.FULL_EVICTION)
        self.bucket_util.add_rbac_user()
        self.bucket_name = self.input.param("bucket_name", None)
        self.magma_buckets = self.input.param("magma_buckets", 0)
        if self.standard_buckets > 10:
            self.bucket_util.change_max_buckets(self.standard_buckets)
        if self.standard_buckets == 1:
            self._create_default_bucket()
        else:
            self._create_multiple_buckets()
        self.buckets = self.bucket_util.buckets

        # num_collections=1 signifies only default collection
        self.num_collections = self.input.param("num_collections", 1)
        self.num_scopes = self.input.param("num_scopes", 1)
        self.scope_name = CbServer.default_scope
        # Creation of scopes if num_scopes is > 1
        scope_prefix = "Scope"
        for bucket in self.bucket_util.buckets:
            for i in range(1, self.num_scopes):
                scope_name = scope_prefix + str(i)
                self.log.info("Creating bucket::scope {} {}"
                              .format(bucket.name, scope_name))
                self.bucket_util.create_scope(self.cluster.master, bucket,
                                              {"name": scope_name})
                self.sleep(2)
        self.scopes = self.buckets[0].scopes.keys()
        self.log.info("Scopes list is {}".format(self.scopes))

        # Creation of collections if num_collections is > 1
        collection_prefix = "FunctionCollection"
        for bucket in self.bucket_util.buckets:
            for scope_name in self.scopes:
                for i in range(1, self.num_collections):
                    collection_name = collection_prefix + str(i)
                    self.log.info("Creating scope::collection {} {}"
                                  .format(scope_name, collection_name))
                    self.bucket_util.create_collection(
                        self.cluster.master, bucket,
                        scope_name, {"name": collection_name})
                    self.sleep(2)
        self.collections = self.buckets[0].scopes[
            self.scope_name].collections.keys()
        self.log.debug("Collections list == {}".format(self.collections))

        # Build an initial index when an index/query node is present
        if self.dcp_services and self.num_collections == 1:
            self.initial_idx = "initial_idx"
            self.initial_idx_q = \
                "CREATE INDEX %s on default:`%s`.`%s`.`%s`(meta().id) " \
                "with {\"defer_build\": false};" % (
                    self.initial_idx, self.buckets[0].name,
                    self.scope_name, self.collections[0])
            self.query_client = RestConnection(self.dcp_servers[0])
            result = self.query_client.query_tool(self.initial_idx_q)
            self.assertTrue(result["status"] == "success",
                            "Index query failed!")

        # Update Magma/Storage Properties
        props = "magma"
        update_bucket_props = False
        self.disable_magma_commit_points = self.input.param(
            "disable_magma_commit_points", False)
        self.max_commit_points = self.input.param("max_commit_points", None)
        if self.disable_magma_commit_points:
            self.max_commit_points = 0
        if self.max_commit_points is not None:
            props += ";magma_max_checkpoints={}".format(self.max_commit_points)
            self.log.debug("props== {}".format(props))
            update_bucket_props = True
        if update_bucket_props:
            self.bucket_util.update_bucket_props(
                "backend", props, self.bucket_util.buckets)

        # Monitor Stats Params
        self.ep_queue_stats = self.input.param("ep_queue_stats", True)
        self.monitor_stats = ["doc_ops", "ep_queue_size"]
        if not self.ep_queue_stats:
            self.monitor_stats = ["doc_ops"]

        # Disk usage before data load
        self.disk_usage_before_loading = self.get_disk_usage(
            self.buckets[0], self.cluster.nodes_in_cluster)[0]
        self.log.info("disk usage before loading {}".format(
            self.disk_usage_before_loading))

        # Doc controlling params
        self.key = 'test_docs'
        if self.random_key:
            self.key = "random_keys"
        self.doc_ops = self.input.param("doc_ops", "create")
        self.key_size = self.input.param("key_size", 8)
        self.doc_size = self.input.param("doc_size", 2048)
        self.gen_create = None
        self.gen_delete = None
        self.gen_read = None
        self.gen_update = None
        # FIX: initialise gen_expiry so loadgen_docs("expiry") cannot hit
        # an AttributeError before generate_docs() has run
        self.gen_expiry = None
        # FIX: create_perc was read from the "update_perc" input key
        # (copy/paste slip); StorageBase reads "create_perc" — match it.
        self.create_perc = self.input.param("create_perc", 100)
        self.update_perc = self.input.param("update_perc", 0)
        self.delete_perc = self.input.param("delete_perc", 0)
        self.expiry_perc = self.input.param("expiry_perc", 0)
        # start/end track the overall key range created so far;
        # *_start/*_end are per-op sub-ranges filled by generate_docs()
        self.start = 0
        self.end = 0
        self.create_start = None
        self.create_end = None
        self.update_start = None
        self.update_end = None
        self.delete_start = None
        self.delete_end = None
        self.read_start = None
        self.read_end = None
        self.expiry_start = None
        self.expiry_end = None
        self.mutate = 0
        self.init_items_per_collection = self.num_items
        self.maxttl = self.input.param("maxttl", 10)

        # Common test params
        self.test_itr = self.input.param("test_itr", 4)
        self.update_itr = self.input.param("update_itr", 2)
        self.next_half = self.input.param("next_half", False)
        self.deep_copy = self.input.param("deep_copy", False)
        if self.active_resident_threshold < 100:
            self.check_temporary_failure_exception = True
        # self.read_thread_count defines the number of threads used
        # to read the same number of documents in parallel
        self.read_thread_count = self.input.param("read_thread_count", 4)
        self.disk_usage = dict()

        # Initial Data Load
        self.initial_load()
        self.log.info("==========Finished magma base setup========")

    def initial_load(self):
        """Seed every collection with the initial 'create' workload and
        record per-bucket disk usage afterwards."""
        self.create_start = 0
        self.create_end = self.init_items_per_collection
        if self.rev_write:
            # Reverse-write mode loads keys in descending order
            self.create_start = -int(self.init_items_per_collection - 1)
            self.create_end = 1
        self.generate_docs(doc_ops="create")
        self.init_loading = self.input.param("init_loading", True)
        self.dgm_batch = self.input.param("dgm_batch", 5000)
        if self.init_loading:
            self.log.debug("initial_items_in_each_collection {}".format(
                self.init_items_per_collection))
            tasks_info = dict()
            for collection in self.collections:
                self.generate_docs(doc_ops="create", target_vbucket=None)
                tem_tasks_info = self.loadgen_docs(self.retry_exceptions,
                                                   self.ignore_exceptions,
                                                   scope=self.scope_name,
                                                   collection=collection,
                                                   _sync=False,
                                                   doc_ops="create")
                tasks_info.update(tem_tasks_info.items())
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(
                tasks_info, self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)
            self.bucket_util._wait_for_stats_all_buckets(timeout=3600)
        if self.standard_buckets == 1 \
                or self.standard_buckets == self.magma_buckets:
            for bucket in self.bucket_util.get_all_buckets():
                disk_usage = self.get_disk_usage(
                    bucket, self.cluster.nodes_in_cluster)
                self.disk_usage[bucket.name] = disk_usage[0]
                self.log.info(
                    "For bucket {} disk usage after initial creation is {}MB"
                    .format(bucket.name, self.disk_usage[bucket.name]))
        self.num_items = self.init_items_per_collection * self.num_collections
        self.read_start = 0
        self.read_end = self.init_items_per_collection

    def _create_default_bucket(self):
        """Create the single default bucket with the configured params."""
        self.bucket_util.create_default_bucket(
            bucket_type=self.bucket_type,
            ram_quota=self.bucket_ram_quota,
            replica=self.num_replicas,
            storage=self.bucket_storage,
            eviction_policy=self.bucket_eviction_policy,
            autoCompactionDefined=self.autoCompactionDefined,
            fragmentation_percentage=self.fragmentation)

    def _create_multiple_buckets(self):
        """Create a mix of couchstore/magma buckets and wait for
        memcached to accept them."""
        buckets_created = self.bucket_util.create_multiple_buckets(
            self.cluster.master,
            self.num_replicas,
            bucket_count=self.standard_buckets,
            bucket_type=self.bucket_type,
            storage={
                "couchstore": self.standard_buckets - self.magma_buckets,
                "magma": self.magma_buckets
            },
            eviction_policy=self.bucket_eviction_policy,
            bucket_name=self.bucket_name,
            fragmentation_percentage=self.fragmentation)
        self.assertTrue(buckets_created, "Unable to create multiple buckets")
        for bucket in self.bucket_util.buckets:
            ready = self.bucket_util.wait_for_memcached(
                self.cluster.master, bucket)
            self.assertTrue(ready, msg="Wait_for_memcached failed")

    def tearDown(self):
        """Log the first bucket's resident ratio (best effort, ~65s
        budget) before delegating to BaseTestCase.tearDown."""
        self.cluster_util.print_cluster_stats()
        dgm = None
        timeout = 65
        while dgm is None and timeout > 0:
            try:
                stats = BucketHelper(self.cluster.master).fetch_bucket_stats(
                    self.buckets[0].name)
                dgm = stats["op"]["samples"][
                    "vb_active_resident_items_ratio"][-1]
            # FIX: was a bare `except:` — keep the best-effort retry but
            # stop swallowing SystemExit/KeyboardInterrupt
            except Exception:
                self.log.debug(
                    "Fetching vb_active_resident_items_ratio(dgm) "
                    "failed...retying")
                timeout -= 1
                time.sleep(1)
        self.log.info("## Active Resident Threshold of {0} is {1} ##".format(
            self.buckets[0].name, dgm))
        super(MagmaBaseTest, self).tearDown()

    def run_compaction(self, compaction_iterations=5):
        """Run full-bucket compaction `compaction_iterations` times on
        every bucket, waiting for each round to finish."""
        for _ in range(compaction_iterations):
            compaction_tasks = list()
            for bucket in self.bucket_util.buckets:
                compaction_tasks.append(
                    self.task.async_compact_bucket(self.cluster.master,
                                                   bucket))
            for task in compaction_tasks:
                self.task_manager.get_task_result(task)

    def validate_seq_itr(self):
        """When an index node exists, build a second (seq-tree driven)
        index and verify both index item counts converge to the KV item
        count (5-minute polls)."""
        if self.dcp_services and self.num_collections == 1:
            index_build_q = \
                "SELECT state FROM system:indexes WHERE name='{}';"
            # Wait for the initial index to come online
            start = time.time()
            result = False
            while start + 300 > time.time():
                result = self.query_client.query_tool(
                    index_build_q.format(self.initial_idx), timeout=60)
                if result["results"][0]["state"] == "online":
                    result = True
                    break
                self.sleep(5)
            self.assertTrue(result, "initial_idx Index warmup failed")
            # Build the final index over the doc body
            self.final_idx = "final_idx"
            self.final_idx_q = \
                "CREATE INDEX %s on default:`%s`.`%s`.`%s`(body) " \
                "with {\"defer_build\": false};" % (
                    self.final_idx, self.buckets[0].name,
                    self.scope_name, self.collections[0])
            result = self.query_client.query_tool(self.final_idx_q,
                                                  timeout=3600)
            start = time.time()
            if result["status"] != "success":
                # Build request did not return success — poll for online
                while start + 300 > time.time():
                    result = self.query_client.query_tool(
                        index_build_q.format(self.final_idx), timeout=60)
                    if result["results"][0]["state"] == "online":
                        result = True
                        break
                    self.sleep(5)
                self.assertTrue(result, "final_idx Index warmup failed")
            else:
                self.assertTrue(result["status"] == "success",
                                "Index query failed!")
            self.sleep(5)
            self.initial_count_q = \
                "Select count(*) as items from default:`{}`.`{}`.`{}` " \
                "where meta().id like '%%';".format(
                    self.buckets[0].name, self.scope_name,
                    self.collections[0])
            self.final_count_q = \
                "Select count(*) as items from default:`{}`.`{}`.`{}` " \
                "where body like '%%';".format(
                    self.buckets[0].name, self.scope_name,
                    self.collections[0])
            self.log.info(self.initial_count_q)
            self.log.info(self.final_count_q)
            initial_count, final_count = 0, 0
            kv_items = self.bucket_util.get_bucket_current_item_count(
                self.cluster, self.buckets[0])
            # Poll until both index counts match KV (or 5 min elapse)
            start = time.time()
            while start + 300 > time.time():
                kv_items = self.bucket_util.get_bucket_current_item_count(
                    self.cluster, self.buckets[0])
                self.log.info("Items in KV: %s" % kv_items)
                initial_count = self.query_client.query_tool(
                    self.initial_count_q)["results"][0]["items"]
                self.log.info("## Initial Index item count in %s:%s:%s == %s"
                              % (self.buckets[0].name, self.scope_name,
                                 self.collections[0], initial_count))
                final_count = self.query_client.query_tool(
                    self.final_count_q)["results"][0]["items"]
                self.log.info("## Final Index item count in %s:%s:%s == %s"
                              % (self.buckets[0].name, self.scope_name,
                                 self.collections[0], final_count))
                if initial_count != kv_items or final_count != kv_items:
                    self.sleep(5)
                    continue
                break
            self.assertTrue(
                initial_count == kv_items,
                "Indexer failed. KV:{}, Initial:{}".format(
                    kv_items, initial_count))
            self.assertTrue(
                final_count == kv_items,
                "Indexer failed. KV:{}, Final:{}".format(
                    kv_items, final_count))

    def genrate_docs_basic(self, start, end, target_vbucket=None, mutate=0):
        """Return a doc_generator for keys [start, end) using the
        suite-wide key/doc sizing knobs."""
        return doc_generator(self.key, start, end,
                             doc_size=self.doc_size,
                             doc_type=self.doc_type,
                             target_vbucket=target_vbucket,
                             vbuckets=self.cluster_util.vbuckets,
                             key_size=self.key_size,
                             randomize_doc_size=self.randomize_doc_size,
                             randomize_value=self.randomize_value,
                             mix_key_size=self.mix_key_size,
                             mutate=mutate,
                             deep_copy=self.deep_copy)

    def generate_docs(self, doc_ops=None, target_vbucket=None,
                      create_end=None, create_start=None, create_mutate=0,
                      update_end=None, update_start=None, update_mutate=0,
                      read_end=None, read_start=None, read_mutate=0,
                      delete_end=None, delete_start=None,
                      expiry_end=None, expiry_start=None, expiry_mutate=0):
        """Build self.gen_* generators for each op named in `doc_ops`.

        Explicit *_start/*_end args override the cached self.*_start/
        self.*_end markers; when both are None the range is derived from
        self.start/self.end and the configured *_perc percentages.
        """
        doc_ops = doc_ops or self.doc_ops
        if "update" in doc_ops:
            if update_start is not None:
                self.update_start = update_start
            if update_end is not None:
                self.update_end = update_end
            if self.update_start is None:
                self.update_start = self.start
            if self.update_end is None:
                self.update_end = self.end * self.update_perc / 100
            # Bump mutation counter so updated docs are distinguishable
            self.mutate += 1
            self.gen_update = self.genrate_docs_basic(
                self.update_start, self.update_end,
                target_vbucket=target_vbucket,
                mutate=self.mutate)
        if "delete" in doc_ops:
            if delete_start is not None:
                self.delete_start = delete_start
            if delete_end is not None:
                self.delete_end = delete_end
            if self.delete_start is None:
                self.delete_start = self.start
            if self.delete_end is None:
                self.delete_end = self.end * self.delete_perc / 100
            # NOTE(review): passes read_mutate (default 0) for deletes —
            # presumably intentional since mutate is moot for deletion
            self.gen_delete = self.genrate_docs_basic(
                self.delete_start, self.delete_end,
                target_vbucket=target_vbucket,
                mutate=read_mutate)
        if "create" in doc_ops:
            if create_start is not None:
                self.create_start = create_start
            if self.create_start is None:
                self.create_start = self.end
            self.start = self.create_start
            if create_end is not None:
                self.create_end = create_end
            if self.create_end is None:
                self.create_end = \
                    self.start + self.num_items * self.create_perc / 100
            self.end = self.create_end
            self.gen_create = self.genrate_docs_basic(
                self.create_start, self.create_end,
                target_vbucket=target_vbucket,
                mutate=create_mutate)
        if "read" in doc_ops:
            if read_start is not None:
                self.read_start = read_start
            if read_end is not None:
                self.read_end = read_end
            if self.read_start is None:
                self.read_start = self.create_start
            if self.read_end is None:
                self.read_end = self.create_end
            self.gen_read = self.genrate_docs_basic(
                self.read_start, self.read_end,
                target_vbucket=target_vbucket,
                mutate=read_mutate)
        if "expiry" in doc_ops:
            if expiry_start is not None:
                self.expiry_start = expiry_start
            elif self.expiry_start is None:
                # Expiry range follows the delete range
                self.expiry_start = \
                    self.start + (self.num_items * self.delete_perc) / 100
            if expiry_end is not None:
                self.expiry_end = expiry_end
            elif self.expiry_end is None:
                self.expiry_end = self.start + self.num_items * \
                    (self.delete_perc + self.expiry_perc) / 100
            self.gen_expiry = self.genrate_docs_basic(
                self.expiry_start, self.expiry_end,
                target_vbucket=target_vbucket,
                mutate=expiry_mutate)

    def _load_all_buckets(self, cluster, kv_gen, op_type, exp, flag=0,
                          only_store_hash=True, batch_size=1000,
                          pause_secs=1, timeout_secs=30, compression=True,
                          dgm_batch=5000, skip_read_on_error=False,
                          suppress_error_table=False, track_failures=True):
        """Synchronously run `op_type` with `kv_gen` against all buckets
        and assert the tasks succeeded.  Returns the task-info dict."""
        retry_exceptions = self.retry_exceptions
        tasks_info = self.bucket_util.sync_load_all_buckets(
            cluster, kv_gen, op_type, exp, flag,
            persist_to=self.persist_to,
            replicate_to=self.replicate_to,
            durability=self.durability_level,
            timeout_secs=timeout_secs,
            only_store_hash=only_store_hash,
            batch_size=batch_size,
            pause_secs=pause_secs,
            sdk_compression=compression,
            process_concurrency=self.process_concurrency,
            retry_exceptions=retry_exceptions,
            active_resident_threshold=self.active_resident_threshold,
            skip_read_on_error=skip_read_on_error,
            suppress_error_table=suppress_error_table,
            dgm_batch=dgm_batch,
            monitor_stats=self.monitor_stats,
            track_failures=track_failures)
        if self.active_resident_threshold < 100:
            # DGM load stops early; record how many docs actually landed
            for task, _ in tasks_info.items():
                self.num_items = task.doc_index
        self.assertTrue(self.bucket_util.doc_ops_tasks_status(tasks_info),
                        "Doc_ops failed in MagmaBase._load_all_buckets")
        return tasks_info

    def loadgen_docs(self, retry_exceptions=None, ignore_exceptions=None,
                     skip_read_on_error=False,
                     suppress_error_table=False,
                     scope=CbServer.default_scope,
                     collection=CbServer.default_collection,
                     _sync=True, track_failures=True, doc_ops=None):
        """Kick off async load tasks for every op in `doc_ops` using the
        pre-built self.gen_* generators; optionally wait (`_sync`).

        Also adjusts self.num_items for creates/deletes/expiries.
        """
        # FIX: defaults were mutable lists ([]), and retry_exceptions was
        # appended to in place — the TemporaryFailureException would
        # accumulate across calls and leak into the caller's list.
        retry_exceptions = list(retry_exceptions or [])
        ignore_exceptions = ignore_exceptions or []
        doc_ops = doc_ops or self.doc_ops
        tasks_info = dict()
        read_tasks_info = dict()
        read_task = False
        if self.check_temporary_failure_exception:
            retry_exceptions.append(SDKException.TemporaryFailureException)
        if "update" in doc_ops and self.gen_update is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_update, "update", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level, pause_secs=5,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures)
            tasks_info.update(tem_tasks_info.items())
        if "create" in doc_ops and self.gen_create is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_create, "create", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level, pause_secs=5,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures)
            tasks_info.update(tem_tasks_info.items())
            self.num_items += (self.gen_create.end - self.gen_create.start)
        if "expiry" in doc_ops and self.gen_expiry is not None \
                and self.maxttl:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_expiry, "update", self.maxttl,
                self.random_exp,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level, pause_secs=5,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures)
            tasks_info.update(tem_tasks_info.items())
            self.num_items -= (self.gen_expiry.end - self.gen_expiry.start)
        if "read" in doc_ops and self.gen_read is not None:
            read_tasks_info = self.bucket_util._async_validate_docs(
                self.cluster, self.gen_read, "read", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                pause_secs=5, timeout_secs=self.sdk_timeout,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                scope=scope, collection=collection)
            read_task = True
        if "delete" in doc_ops and self.gen_delete is not None:
            tem_tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, self.gen_delete, "delete", 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                durability=self.durability_level, pause_secs=5,
                timeout_secs=self.sdk_timeout, retries=self.sdk_retries,
                retry_exceptions=retry_exceptions,
                ignore_exceptions=ignore_exceptions,
                skip_read_on_error=skip_read_on_error,
                suppress_error_table=suppress_error_table,
                scope=scope, collection=collection,
                monitor_stats=self.monitor_stats,
                track_failures=track_failures)
            tasks_info.update(tem_tasks_info.items())
            self.num_items -= (self.gen_delete.end - self.gen_delete.start)
        if _sync:
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(
                tasks_info, self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)
        if read_task:
            # TODO: Need to converge read_tasks_info into tasks_info before
            # itself to avoid confusions during _sync=False case
            tasks_info.update(read_tasks_info.items())
            if _sync:
                for task in read_tasks_info:
                    self.task_manager.get_task_result(task)
        return tasks_info

    def get_bucket_dgm(self, bucket):
        """Return the bucket's latest active resident ratio (%).

        Retries up to 5 times; falls back to 100 if the stat is
        unavailable.
        """
        self.rest_client = BucketHelper(self.cluster.master)
        count = 0
        dgm = 100
        while count < 5:
            try:
                dgm = self.rest_client.fetch_bucket_stats(
                    bucket.name
                )["op"]["samples"]["vb_active_resident_items_ratio"][-1]
                self.log.info("Active Resident Threshold of {0} is {1}"
                              .format(bucket.name, dgm))
                return dgm
            except Exception as e:
                self.sleep(5, e)
                count += 1
        return dgm

    def get_magma_stats(self, bucket, servers=None, field_to_grep=None):
        """Collect `cbstats magma` output per server; returns a dict
        keyed by server IP."""
        magma_stats_for_all_servers = dict()
        servers = servers or self.cluster.nodes_in_cluster
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            result = dict()
            shell = RemoteMachineShellConnection(server)
            cbstat_obj = Cbstats(shell)
            result = cbstat_obj.magma_stats(bucket.name,
                                            field_to_grep=field_to_grep)
            shell.disconnect()
            magma_stats_for_all_servers[server.ip] = result
        return magma_stats_for_all_servers

    def get_disk_usage(self, bucket, servers=None):
        """Sum magma on-disk usage (MB) across `servers`.

        Returns [kvstore, wal, keyTree, seqTree] totals measured via
        `du` over the bucket's magma.* directories.
        """
        disk_usage = []
        if servers is None:
            servers = self.cluster.nodes_in_cluster
        if type(servers) is not list:
            servers = [servers]
        kvstore = 0
        wal = 0
        keyTree = 0
        seqTree = 0
        for server in servers:
            shell = RemoteMachineShellConnection(server)
            kvstore += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'" % os.path.join(
                    RestConnection(server).get_data_path(),
                    bucket.name, "magma.*/kv*"))[0][0].split('\n')[0])
            wal += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'" % os.path.join(
                    RestConnection(server).get_data_path(),
                    bucket.name, "magma.*/wal"))[0][0].split('\n')[0])
            keyTree += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'" % os.path.join(
                    RestConnection(server).get_data_path(),
                    bucket.name,
                    "magma.*/kv*/rev*/key*"))[0][0].split('\n')[0])
            seqTree += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'" % os.path.join(
                    RestConnection(server).get_data_path(),
                    bucket.name,
                    "magma.*/kv*/rev*/seq*"))[0][0].split('\n')[0])
            shell.disconnect()
        self.log.info("Disk usage stats for bucekt {} is below".format(
            bucket.name))
        self.log.info("Total Disk usage for kvstore is {}MB".format(kvstore))
        self.get_bucket_dgm(bucket)
        self.log.debug("Total Disk usage for wal is {}MB".format(wal))
        self.log.debug("Total Disk usage for keyTree is {}MB".format(keyTree))
        self.log.debug("Total Disk usage for seqTree is {}MB".format(seqTree))
        disk_usage.extend([kvstore, wal, keyTree, seqTree])
        return disk_usage

    def change_swap_space(self, servers=None, disable=True):
        """Toggle swap (swapoff/swapon -a) on `servers` and assert the
        resulting total swap is 0 / non-0 respectively."""
        servers = servers or self.cluster.nodes_in_cluster
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            shell = RemoteMachineShellConnection(server)
            if disable:
                _ = shell.execute_command("swapoff -a")
                self.sleep(5)
                # `free | tail -1` is the Swap: line; $2 is total swap
                output = shell.execute_command(
                    "free | tail -1 | awk '{print $2}'")[0][0].split('\n')[0]
                self.assertEqual(
                    int(output), 0,
                    msg="Failed to disable swap space on server {} "
                        "having value {}".format(server, output))
            else:
                _ = shell.execute_command("swapon -a")
                self.sleep(5)
                output = shell.execute_command(
                    "free | tail -1 | awk '{print $2}'")[0][0].split('\n')[0]
                self.assertNotEqual(
                    int(output), 0,
                    msg="Failed to enable swap space on server {} "
                        "having value {}".format(server, output))
        return

    def check_fragmentation_using_magma_stats(self, bucket, servers=None):
        """Poll per-shard magma Fragmentation stats for up to 5 minutes;
        True once every shard is below self.fragmentation/100."""
        result = dict()
        time_end = time.time() + 60 * 5
        if servers is None:
            servers = self.cluster.nodes_in_cluster
        if type(servers) is not list:
            servers = [servers]
        while time.time() < time_end:
            stats = list()
            for server in servers:
                fragmentation_values = list()
                shell = RemoteMachineShellConnection(server)
                output = shell.execute_command(
                    "lscpu | grep 'CPU(s)' | head -1 | awk '{print $2}'"
                )[0][0].split('\n')[0]
                self.log.debug("machine: {} - core(s): {}".format(server.ip,
                                                                  output))
                # One magma shard per core, capped at 64
                for i in range(min(int(output), 64)):
                    grep_field = "rw_{}:magma".format(i)
                    _res = self.get_magma_stats(bucket, [server],
                                                field_to_grep=grep_field)
                    fragmentation_values.append(
                        float(_res[server.ip][grep_field]["Fragmentation"]))
                    stats.append(_res)
                result.update({server.ip: fragmentation_values})
            res = list()
            for value in result.values():
                res.append(max(value))
            if max(res) < float(self.fragmentation) / 100:
                self.log.info("magma stats fragmentation result {}"
                              .format(result))
                return True
        self.log.info("magma stats fragmentation result {}".format(result))
        self.log.info(stats)
        return False

    def check_fragmentation_using_bucket_stats(self, bucket, servers=None):
        """Poll KV bucket-level fragmentation for up to 5 minutes; True
        once all nodes are below self.fragmentation."""
        result = dict()
        if servers is None:
            servers = self.cluster.nodes_in_cluster
        if type(servers) is not list:
            servers = [servers]
        time_end = time.time() + 60 * 5
        while time.time() < time_end:
            for server in servers:
                frag_val = self.bucket_util.get_fragmentation_kv(
                    bucket, server)
                self.log.debug("Current Fragmentation for node {} is {}"
                               .format(server.ip, frag_val))
                result.update({server.ip: frag_val})
            if max(result.values()) < self.fragmentation:
                self.log.info(
                    "KV stats fragmentation values {}".format(result))
                return True
        self.log.info("KV stats fragmentation values {}".format(result))
        return False

    def get_fragmentation_upsert_docs_list(self):
        """
        This function gives the list of "number of docs" need
        to be updated to touch the given fragmentation value.

        Batches are capped at self.num_items each.
        """
        update_doc_count = int(math.ceil(
            float(self.fragmentation * self.num_items) /
            (100 - self.fragmentation)))
        upsert_doc_list = list()
        while update_doc_count > self.num_items:
            upsert_doc_list.append(self.num_items)
            update_doc_count -= self.num_items
        if update_doc_count > 0:
            upsert_doc_list.append(update_doc_count)
        self.log.info("Upsert list {}".format(upsert_doc_list))
        return upsert_doc_list

    def validate_data(self, op_type, kv_gen, _sync=True):
        """Validate docs produced by `kv_gen` in every collection.
        Returns the task dict when _sync=False."""
        self.log.info("Validating Docs")
        validate_tasks_info = dict()
        for collection in self.collections:
            temp_tasks_info = self.bucket_util._async_validate_docs(
                self.cluster, kv_gen, op_type, 0,
                batch_size=self.batch_size,
                process_concurrency=self.process_concurrency,
                pause_secs=5, timeout_secs=self.sdk_timeout,
                scope=self.scope_name, collection=collection,
                retry_exceptions=self.retry_exceptions,
                ignore_exceptions=self.ignore_exceptions)
            validate_tasks_info.update(temp_tasks_info.items())
        if _sync:
            for task in validate_tasks_info:
                self.task_manager.get_task_result(task)
        else:
            return validate_tasks_info

    def sigkill_memcached(self, nodes=None, graceful=False):
        """Kill memcached (or restart couchbase when graceful) on each
        node and wait for bucket warmup on the master."""
        nodes = nodes or self.cluster.nodes_in_cluster
        for node in nodes:
            shell = RemoteMachineShellConnection(node)
            if graceful:
                shell.restart_couchbase()
            else:
                shell.kill_memcached()
            shell.disconnect()
        self.assertTrue(
            self.bucket_util._wait_warmup_completed(
                [self.cluster_util.cluster.master],
                self.bucket_util.buckets[0],
                wait_time=self.wait_timeout * 20))

    def crash(self, nodes=None, kill_itr=1, graceful=False, wait=True,
              force_collect=False):
        """Repeatedly kill memcached on KV nodes (at random 30-60s
        intervals) until self.stop_crash is set externally; checks for
        coredumps and optionally waits for warmup each round."""
        self.stop_crash = False
        count = kill_itr
        loop_itr = 0
        nodes = nodes or self.cluster.nodes_in_cluster
        connections = dict()
        for node in nodes:
            shell = RemoteMachineShellConnection(node)
            connections.update({node: shell})
        while not self.stop_crash:
            loop_itr += 1
            sleep = random.randint(30, 60)
            self.sleep(
                sleep,
                "Iteration:{} waiting for {} sec to kill memcached on all "
                "nodes".format(loop_itr, sleep))
            for node, shell in connections.items():
                if "kv" in node.services:
                    if graceful:
                        shell.restart_couchbase()
                    else:
                        # kill memcached `kill_itr` times back-to-back
                        while count > 0:
                            shell.kill_memcached()
                            self.sleep(
                                3,
                                "Sleep before killing memcached on same "
                                "node again.")
                            count -= 1
                        count = kill_itr
            result = self.check_coredump_exist(self.cluster.nodes_in_cluster,
                                               force_collect=force_collect)
            if result:
                self.stop_crash = True
                self.task.jython_task_manager.abort_all_tasks()
                self.assertFalse(
                    result,
                    "CRASH | CRITICAL | WARN messages found in cb_logs")
            if wait:
                for node in nodes:
                    if "kv" in node.services:
                        result = self.bucket_util._wait_warmup_completed(
                            [node], self.bucket_util.buckets[0],
                            wait_time=self.wait_timeout * 5)
                        if not result:
                            self.stop_crash = True
                            self.task.jython_task_manager.abort_all_tasks()
                            self.assertFalse(result)
        for _, shell in connections.items():
            shell.disconnect()

    def get_state_files(self, bucket, server=None):
        """List the seqIndex 'state' files of the first kvstore under
        magma.0 on `server` (master by default)."""
        if server is None:
            server = self.cluster_util.cluster.master
        shell = RemoteMachineShellConnection(server)
        magma_path = os.path.join(
            RestConnection(server).get_data_path(),
            bucket.name, "magma.0")
        kv_path = shell.execute_command(
            "ls %s | grep kv | head -1" % magma_path)[0][0].split('\n')[0]
        path = os.path.join(magma_path, kv_path, "rev*/seqIndex")
        self.log.debug("SeqIndex path = {}".format(path))
        output = shell.execute_command("ls %s | grep state" % path)[0]
        self.log.debug("State files = {}".format(output))
        shell.disconnect()
        return output

    def get_tombstone_count_key(self, servers=None):
        """Count tombstones ("deleted":true keys) in the key tree of the
        first bucket across all magma shards on `servers`."""
        # FIX: default was a shared mutable list ([])
        servers = servers or []
        result = 0
        for server in servers:
            data_path = RestConnection(server).get_data_path()
            bucket = self.bucket_util.buckets[0]
            magma_path = os.path.join(data_path, bucket.name, "magma.{}")
            shell = RemoteMachineShellConnection(server)
            shards = shell.execute_command(
                "lscpu | grep 'CPU(s)' | head -1 | awk '{print $2}'"
            )[0][0].split('\n')[0]
            self.log.debug("machine: {} - core(s): {}".format(
                server.ip, shards))
            for shard in range(min(int(shards), 64)):
                magma = magma_path.format(shard)
                kvstores, _ = shell.execute_command(
                    "ls {} | grep kvstore".format(magma))
                cmd = '/opt/couchbase/bin/magma_dump {}'.format(magma)
                for kvstore in kvstores:
                    dump = cmd
                    kvstore_num = kvstore.split("-")[1].strip()
                    dump += ' --kvstore {} --tree key --treedata | grep Key ' \
                            '|grep \'"deleted":true\' | wc -l'.format(
                                kvstore_num)
                    result += int(shell.execute_command(dump)[0][0].strip())
            # FIX: the shell connection was never closed (leaked per server)
            shell.disconnect()
        return result

    def get_tombstone_count_seq(self, server=None, shard=0, kvstore=0):
        """Count Seq entries in the given shard/kvstore via magma_dump."""
        cmd = '/opt/couchbase/bin/magma_dump /data/kv/default/magma.{}/ \
--kvstore {} --tree key --treedata | grep Seq| wc -l'.format(
            shard, kvstore)
        shell = RemoteMachineShellConnection(server)
        result = shell.execute_command(cmd)[0]
        return result

    def get_level_data_range(self, server=None, tree="key", shard=0,
                             kvstore=0):
        """Dump the requested magma tree for the given shard/kvstore."""
        cmd = '/opt/couchbase/bin/magma_dump /data/kv/default/magma.{}/ \
--kvstore {} --tree {}'.format(shard, kvstore, tree)
        shell = RemoteMachineShellConnection(server)
        result = shell.execute_command(cmd)[0]
        return result

    def set_metadata_purge_interval(self, value, buckets=None, node=None):
        """Set purge_interval on `buckets` via diag/eval, restart
        memcached on all KV nodes and wait for bucket warmup."""
        self.log.info(
            "Changing the bucket properties by changing {0} to {1}".format(
                "purge_interval", value))
        # FIX: default was a shared mutable list ([])
        if not buckets:
            buckets = self.buckets
        if node is None:
            node = self.cluster.master
        rest = RestConnection(node)
        shell = RemoteMachineShellConnection(node)
        shell.enable_diag_eval_on_non_local_hosts()
        shell.disconnect()
        for bucket in buckets:
            cmd = '{ok, BC} = ns_bucket:get_bucket(' \
                  '"%s"), BC2 = lists:keyreplace(purge_interval, ' \
                  '1, BC, {purge_interval, %f})' \
                  ', ns_bucket:set_bucket_config("%s", BC2).' \
                  % (bucket.name, value, bucket.name)
            rest.diag_eval(cmd)
        # Restart Memcached in all cluster nodes to reflect the settings
        for server in self.cluster_util.get_kv_nodes(master=node):
            shell = RemoteMachineShellConnection(server)
            shell.restart_couchbase()
            shell.disconnect()
        # Check bucket warm-up after Couchbase restart
        retry_count = 10
        buckets_warmed_up = self.bucket_util.is_warmup_complete(
            buckets, retry_count)
        if not buckets_warmed_up:
            self.log.critical("Few bucket(s) not warmed up "
                              "within expected time")
class volume(BaseTestCase):
    """Long-running volume test: repeatedly loads docs while performing
    rebalance in/out/swap, replica changes, memcached restarts and
    failover/recovery cycles, validating data after each step."""

    # will add the __init__ functions after the test has been stabilised
    def setUp(self):
        """Read test-input params and prepare server pools/counters."""
        self.input = TestInputSingleton.input
        # Buckets are created explicitly in create_required_buckets().
        self.input.test_params.update({"default_bucket": False})
        BaseTestCase.setUp(self)
        self.rest = RestConnection(self.servers[0])
        self.op_type = self.input.param("op_type", "create")
        self.tasks = []         # To have all tasks running in parallel.
        self._iter_count = 0    # To keep a check of how many items are deleted
        self.available_servers = list()
        # Spare nodes not part of the initial cluster (rebalance-in pool).
        self.available_servers = self.cluster.servers[self.nodes_init:]
        self.num_buckets = self.input.param("num_buckets", 1)
        self.mutate = 0
        self.doc_ops = self.input.param("doc_ops", None)
        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')
        self.iterations = self.input.param("iterations", 2)
        self.vbucket_check = self.input.param("vbucket_check", True)
        self.new_num_writer_threads = self.input.param(
            "new_num_writer_threads", 6)
        self.new_num_reader_threads = self.input.param(
            "new_num_reader_threads", 8)

    def create_required_buckets(self):
        """Size the KV memory quota from what other services leave free,
        then create either one bucket ("GleamBookUsers") or num_buckets
        named buckets, and rebalance. Returns the last created bucket."""
        self.log.info("Get the available memory quota")
        self.info = self.rest.get_nodes_self()
        threshold_memory = 100
        # threshold_memory_vagrant = 100
        total_memory_in_mb = self.info.mcdMemoryReserved
        total_available_memory_in_mb = total_memory_in_mb
        active_service = self.info.services

        # If the mentioned service is already present,
        # we remove that much memory from available memory quota
        if "index" in active_service:
            total_available_memory_in_mb -= self.info.indexMemoryQuota
        if "fts" in active_service:
            total_available_memory_in_mb -= self.info.ftsMemoryQuota
        if "cbas" in active_service:
            total_available_memory_in_mb -= self.info.cbasMemoryQuota
        if "eventing" in active_service:
            total_available_memory_in_mb -= self.info.eventingMemoryQuota

        available_memory = total_available_memory_in_mb - threshold_memory
        # available_memory = total_available_memory_in_mb - threshold_memory_vagrant
        self.rest.set_service_memoryQuota(service='memoryQuota',
                                          memoryQuota=available_memory)

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        duration = self.input.param("bucket_expiry", 0)
        eviction_policy = self.input.param("eviction_policy",
                                           Bucket.EvictionPolicy.VALUE_ONLY)
        self.bucket_type = self.input.param(
            "bucket_type", Bucket.Type.MEMBASE)  # Bucket.bucket_type.EPHEMERAL
        compression_mode = self.input.param(
            "compression_mode",
            Bucket.CompressionMode.PASSIVE)  # Bucket.bucket_compression_mode.ACTIVE
        ramQuota = self.input.param("ramQuota", available_memory)
        bucket_names = self.input.param("bucket_names", "GleamBookUsers")
        # All multi-valued params are ';'-separated lists, one entry
        # per bucket.
        if bucket_names:
            bucket_names = bucket_names.split(';')
        if self.bucket_type:
            self.bucket_type = self.bucket_type.split(';')
        if compression_mode:
            compression_mode = compression_mode.split(';')
        if eviction_policy:
            eviction_policy = eviction_policy.split(';')
        if self.num_buckets == 1:
            bucket = Bucket({"name": "GleamBookUsers",
                             "ramQuotaMB": ramQuota,
                             "maxTTL": duration,
                             "replicaNumber": self.num_replicas,
                             "evictionPolicy": eviction_policy[0],
                             "bucketType": self.bucket_type[0],
                             "compressionMode": compression_mode[0]})
            self.bucket_util.create_bucket(bucket)
        elif 1 < self.num_buckets == len(bucket_names):
            # RAM quota is split evenly between buckets.
            for i in range(self.num_buckets):
                bucket = Bucket({"name": bucket_names[i],
                                 "ramQuotaMB": ramQuota/self.num_buckets,
                                 "maxTTL": duration,
                                 "replicaNumber": self.num_replicas,
                                 "evictionPolicy": eviction_policy[i],
                                 "bucketType": self.bucket_type[i],
                                 "compressionMode": compression_mode[i]})
                self.bucket_util.create_bucket(bucket)
        else:
            self.fail("Number of bucket/Names not sufficient")

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        self.rest.monitorRebalance()
        return bucket

    def set_num_writer_and_reader_threads(self, num_writer_threads="default",
                                          num_reader_threads="default"):
        """Apply memcached reader/writer thread settings on all KV nodes."""
        for node in self.cluster_util.get_kv_nodes():
            bucket_helper = BucketHelper(node)
            bucket_helper.update_memcached_settings(
                num_writer_threads=num_writer_threads,
                num_reader_threads=num_reader_threads)

    def volume_doc_generator_users(self, key, start, end):
        """Generator for GleamBook 'users' documents in [start, end)."""
        template = '{{ "id":"{0}", "alias":"{1}", "name":"{2}", "user_since":"{3}", "employment":{4} }}'
        return GleamBookUsersDocumentGenerator(key, template,
                                               start=start, end=end)

    def volume_doc_generator_messages(self, key, start, end):
        """Generator for GleamBook 'messages' documents in [start, end)."""
        template = '{{ "message_id": "{0}", "author_id": "{1}", "send_time": "{2}" }}'
        return GleamBookMessagesDocumentGenerator(key, template,
                                                  start=start, end=end)

    def initial_data_load(self, initial_load):
        """Seed all buckets with *initial_load* docs ("create" op), using
        the transactional loader when self.atomicity is set."""
        if self.atomicity:
            task = self.task.async_load_gen_docs_atomicity(
                self.cluster, self.bucket_util.buckets, initial_load,
                "create", exp=0,
                batch_size=10,
                process_concurrency=self.process_concurrency,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                timeout_secs=self.sdk_timeout,
                retries=self.sdk_retries, update_count=self.mutate,
                transaction_timeout=self.transaction_timeout,
                commit=self.transaction_commit,
                durability=self.durability_level, sync=self.sync)
            self.task.jython_task_manager.get_task_result(task)
        else:
            tasks_info = self.bucket_util._async_load_all_buckets(
                self.cluster, initial_load, "create", exp=0,
                persist_to=self.persist_to,
                replicate_to=self.replicate_to,
                batch_size=10, pause_secs=5, timeout_secs=30,
                durability=self.durability_level,
                process_concurrency=self.process_concurrency,
                retries=self.sdk_retries)
            for task, task_info in tasks_info.items():
                self.task_manager.get_task_result(task)
        self.sleep(10)

    # Loading documents in 2 buckets in parallel through transactions
    def doc_load_using_txns(self):
        """Start transactional load tasks for each requested doc op whose
        generator is prepared, then wait for all of them."""
        if "update" in self.doc_ops and self.gen_update_users is not None:
            self.tasks.append(self.doc_loader_txn("update",
                                                  self.gen_update_users))
        if "create" in self.doc_ops and self.gen_create_users is not None:
            self.tasks.append(self.doc_loader_txn("create",
                                                  self.gen_create_users))
        if "delete" in self.doc_ops and self.gen_delete_users is not None:
            self.tasks.append(self.doc_loader_txn("delete",
                                                  self.gen_delete_users))
        self.sleep(20)
        for task in self.tasks:
            self.task.jython_task_manager.get_task_result(task)

    def doc_loader_txn(self, op_type, kv_gen):
        """Kick off one async transactional doc-load task and return it.

        :param op_type: "create"/"update"/"delete"
        :param kv_gen: doc generator for the op
        """
        if op_type == "update":
            print("Value of Mutated is", self.mutate)
            self.sleep(5)
        process_concurrency = self.process_concurrency
        # NOTE(review): proportional-concurrency logic below was disabled;
        # kept for reference.
        # if op_type == "update":
        #     if "create" not in self.doc_ops:
        #         self.create_perc = 0
        #     if "delete" not in self.doc_ops:
        #         self.delete_perc = 0
        #     process_concurrency = (self.update_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        # if op_type == "create":
        #     if "update" not in self.doc_ops:
        #         self.update_perc = 0
        #     if "delete" not in self.doc_ops:
        #         self.delete_perc = 0
        #     process_concurrency = (self.create_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        # if op_type == "delete":
        #     if "create" not in self.doc_ops:
        #         self.create_perc = 0
        #     if "update" not in self.doc_ops:
        #         self.update_perc = 0
        #     process_concurrency = (self.delete_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster, self.bucket_util.buckets, kv_gen, op_type,
            exp=0,
            batch_size=10,
            process_concurrency=process_concurrency,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            update_count=self.mutate,
            transaction_timeout=self.transaction_timeout,
            commit=self.transaction_commit,
            durability=self.durability_level,
            sync=self.sync, defer=self.defer)
        return task

    # Loading documents through normal doc loader
    def normal_doc_loader(self):
        """Start non-transactional load tasks for each requested doc op;
        return the combined task-info dict."""
        tasks_info = dict()
        if "update" in self.doc_ops and self.gen_update_users is not None:
            task_info = self.doc_loader("update", self.gen_update_users)
            tasks_info.update(task_info.items())
        if "create" in self.doc_ops and self.gen_create_users is not None:
            task_info = self.doc_loader("create", self.gen_create_users)
            tasks_info.update(task_info.items())
        if "delete" in self.doc_ops and self.gen_delete_users is not None:
            task_info = self.doc_loader("delete", self.gen_delete_users)
            tasks_info.update(task_info.items())
        return tasks_info

    def doc_loader(self, op_type, kv_gen):
        """Start one async non-transactional load, splitting the configured
        process concurrency proportionally to this op's percentage share.

        NOTE(review): mutates self.create_perc/update_perc/delete_perc as a
        side effect when an op is absent from doc_ops.
        """
        process_concurrency = self.process_concurrency
        if op_type == "update":
            if "create" not in self.doc_ops:
                self.create_perc = 0
            if "delete" not in self.doc_ops:
                self.delete_perc = 0
            process_concurrency = (self.update_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        if op_type == "create":
            if "update" not in self.doc_ops:
                self.update_perc = 0
            if "delete" not in self.doc_ops:
                self.delete_perc = 0
            process_concurrency = (self.create_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        if op_type == "delete":
            if "create" not in self.doc_ops:
                self.create_perc = 0
            if "update" not in self.doc_ops:
                self.update_perc = 0
            process_concurrency = (self.delete_perc*process_concurrency)/(self.create_perc + self.delete_perc + self.update_perc)
        # Transient SDK errors that should be retried, not failed.
        retry_exceptions = [
            SDKException.AmbiguousTimeoutException,
            SDKException.RequestCanceledException,
            SDKException.DurabilityAmbiguousException,
            SDKException.DurabilityImpossibleException,
        ]
        tasks_info = self.bucket_util._async_load_all_buckets(
            self.cluster, kv_gen, op_type, 0, batch_size=20,
            persist_to=self.persist_to, replicate_to=self.replicate_to,
            durability=self.durability_level, pause_secs=5,
            timeout_secs=30, process_concurrency=process_concurrency,
            retries=self.sdk_retries, retry_exceptions=retry_exceptions)
        return tasks_info

    # Stopping and restarting the memcached process
    def stop_process(self):
        """Stop memcached on servers[2] for 20s, then revert."""
        target_node = self.servers[2]
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        error_to_simulate = "stop_memcached"
        # Induce the error condition
        error_sim.create(error_to_simulate)
        self.sleep(20, "Wait before reverting the error condition")
        # Revert the simulated error condition and close the ssh session
        error_sim.revert(error_to_simulate)
        remote.disconnect()

    def rebalance(self, nodes_in=0, nodes_out=0):
        """Start an async rebalance adding *nodes_in* spare nodes and
        removing *nodes_out* random non-master nodes; update the
        available-server pool and cluster membership bookkeeping.

        :return: the async rebalance task
        """
        servs_in = random.sample(self.available_servers, nodes_in)
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        servs_out = random.sample(self.nodes_cluster, nodes_out)
        # Swap rebalance: vbucket-shuffling check not applicable.
        if nodes_in == nodes_out:
            self.vbucket_check = False
        rebalance_task = self.task.async_rebalance(
            self.cluster.servers[:self.nodes_init], servs_in, servs_out,
            check_vbucket_shuffling=self.vbucket_check)
        self.available_servers = [servs for servs in self.available_servers
                                  if servs not in servs_in]
        self.available_servers += servs_out
        self.cluster.nodes_in_cluster.extend(servs_in)
        self.cluster.nodes_in_cluster = list(
            set(self.cluster.nodes_in_cluster) - set(servs_out))
        return rebalance_task

    def rebalance_validation(self, tasks_info, rebalance_task):
        """Fail the test if the rebalance task did not succeed, after
        draining the outstanding doc-load tasks."""
        if not rebalance_task.result:
            for task, _ in tasks_info.items():
                self.task.jython_task_manager.get_task_result(task)
            self.fail("Rebalance Failed")

    def data_validation(self, tasks_info):
        """Wait for load tasks, verify their exceptions/failures, then
        validate active (and, for non-atomic runs, replica) docs for
        every op generator and bucket."""
        if not self.atomicity:
            for task in tasks_info:
                self.task_manager.get_task_result(task)
            self.bucket_util.verify_doc_op_task_exceptions(tasks_info,
                                                           self.cluster)
            self.bucket_util.log_doc_ops_task_failures(tasks_info)
            self.sleep(10)
            for task, task_info in tasks_info.items():
                self.assertFalse(
                    task_info["ops_failed"],
                    "Doc ops failed for task: {}".format(task.thread_name))
        self.log.info("Validating Active/Replica Docs")
        # Replica validation is skipped for transactional runs.
        if self.atomicity:
            self.check_replica = False
        else:
            self.check_replica = True
        for bucket in self.bucket_util.buckets:
            tasks = list()
            if self.gen_update_users is not None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_update_users, "update", 0,
                    batch_size=10, check_replica=self.check_replica))
            if self.gen_create_users is not None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_create_users, "create", 0,
                    batch_size=10, check_replica=self.check_replica))
            if self.gen_delete_users is not None:
                tasks.append(self.task.async_validate_docs(
                    self.cluster, bucket, self.gen_delete_users, "delete", 0,
                    batch_size=10, check_replica=self.check_replica))
            for task in tasks:
                self.task.jython_task_manager.get_task_result(task)
            self.sleep(20)
        if not self.atomicity:
            self.bucket_util._wait_for_stats_all_buckets()
            # Expected item count = created range minus cumulative deletes.
            self.bucket_util.verify_stats_all_buckets(
                self.end - self.initial_load_count*self.delete_perc/100*self._iter_count)

    def data_load(self):
        """Dispatch to the transactional or normal loader; returns the
        task-info dict (empty for transactional runs, which block)."""
        tasks_info = dict()
        if self.atomicity:
            self.doc_load_using_txns()
            self.sleep(10)
        else:
            tasks_info = self.normal_doc_loader()
            self.sleep(10)
        return tasks_info

    def generate_docs(self):
        """Build the update/delete/create doc generators for the next load
        round from the *_perc params, advancing self.start/self.end and
        the mutation counter."""
        self.create_perc = self.input.param("create_perc", 100)
        self.update_perc = self.input.param("update_perc", 10)
        self.delete_perc = self.input.param("delete_perc", 10)
        self.gen_delete_users = None
        self.gen_create_users = None
        self.gen_update_users = None
        if "update" in self.doc_ops:
            self.mutate += 1
            # Updates always target the first update_perc% of keys.
            self.gen_update_users = doc_generator(
                "Users", 0, self.initial_load_count*self.update_perc/100,
                doc_size=self.doc_size, mutate=self.mutate)
        if "delete" in self.doc_ops:
            # Deletes target keys from the current start of the live range.
            self.gen_delete_users = doc_generator(
                "Users", self.start,
                self.start + (self.initial_load_count*self.delete_perc)/100,
                doc_size=self.doc_size)
            self._iter_count += 1
        if "create" in self.doc_ops:
            # New keys extend the range past the previous end.
            self.start = self.end
            self.end += self.initial_load_count*self.create_perc/100
            self.gen_create_users = doc_generator(
                "Users", self.start, self.end, doc_size=self.doc_size)

    def data_validation_mode(self, tasks_info):
        """Thin wrapper around data_validation (atomicity branch disabled)."""
        # if not self.atomicity:
        self.data_validation(tasks_info)
        '''
        else:
            for task in self.tasks:
                self.task.jython_task_manager.get_task_result(task)
            self.sleep(10)
        '''

    def get_bucket_dgm(self, bucket):
        """Log the active resident ratio (DGM %) of *bucket*."""
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"]["vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))

    def print_crud_stats(self):
        """Log a table summarizing item counts and CRUD ranges for the
        current iteration."""
        self.table = TableView(self.log.info)
        self.table.set_headers(["Initial Items",
                                "Current Items",
                                "Items Updated",
                                "Items Created",
                                "Items Deleted"])
        if self._iter_count != 0:
            self.table.add_row([
                str(self.start - self.initial_load_count*self.delete_perc/100*(self._iter_count-1)),
                str(self.end - self.initial_load_count*self.delete_perc/100*self._iter_count),
                str(self.update_perc - self.update_perc) + "---" +
                str(self.initial_load_count*self.update_perc/100),
                str(self.start) + "---" + str(self.end),
                str(self.start - self.initial_load_count*self.create_perc/100) + "---" +
                str(self.start + (self.initial_load_count*self.delete_perc/100) - self.initial_load_count*self.create_perc/100)])
        self.table.display("Docs statistics")

    def test_volume_taf(self):
        """End-to-end volume scenario: build a cluster, create buckets,
        then loop `iterations` times through steps 4-15 (load + rebalance
        variants, replica changes, memcached restart, failover/recovery,
        flush)."""
        #######################################################################
        self.log.info("Step1: Create a n node cluster")
        nodes_init = self.cluster.servers[1:self.nodes_init] \
            if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend(
            [self.cluster.master] + nodes_init)
        self.query_node = self.cluster.master

        #######################################################################
        self.log.info("Step 2 & 3: Create required buckets.")
        bucket = self.create_required_buckets()
        self.loop = 0
        #######################################################################
        while self.loop < self.iterations:
            self.log.info("Step 4: Pre-Requisites for Loading of docs")
            self.start = 0
            self.bucket_util.add_rbac_user()
            self.end = self.initial_load_count = self.input.param(
                "initial_load", 1000)
            initial_load = doc_generator(
                "Users", self.start, self.start + self.initial_load_count,
                doc_size=self.doc_size)
            self.initial_data_load(initial_load)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 5: Rebalance in with Loading of docs")
            self.generate_docs()
            self.gen_delete_users = None
            self._iter_count = 0
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 6: Rebalance Out with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=0, nodes_out=1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 7: Rebalance In_Out with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=2, nodes_out=1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 8: Swap with Loading of docs")
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=1)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 9: Updating the bucket replica to 2")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=2)
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            # Rebalance-in one node so the extra replica can be placed.
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            if "ephemeral" in self.bucket_type:
                self.log.info("No Memcached kill for epehemral bucket")
            else:
                self.log.info(
                    "Step 10: Stopping and restarting memcached process")
                self.generate_docs()
                if not self.atomicity:
                    self.set_num_writer_and_reader_threads(
                        num_writer_threads=self.new_num_writer_threads,
                        num_reader_threads=self.new_num_reader_threads)
                rebalance_task = self.task.async_rebalance(
                    self.cluster.servers, [], [])
                tasks_info = self.data_load()
                if not self.atomicity:
                    self.set_num_writer_and_reader_threads(
                        num_writer_threads="disk_io_optimized",
                        num_reader_threads="disk_io_optimized")
                # self.sleep(600, "Wait for Rebalance to start")
                self.task.jython_task_manager.get_task_result(rebalance_task)
                reached = RestHelper(self.rest).rebalance_reached(
                    wait_step=120)
                self.assertTrue(reached,
                                "rebalance failed, stuck or did not complete")
                self.stop_process()
                self.data_validation_mode(tasks_info)
                self.tasks = []
                self.bucket_util.print_bucket_stats()
                self.print_crud_stats()
                self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 11: Failover a node and RebalanceOut that node with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            # Snapshot failover logs / seqnos / datasets for post-checks.
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)
            disk_replica_dataset, disk_active_dataset = \
                self.bucket_util.get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)

            # Mark Node for failover
            self.generate_docs()
            tasks_info = self.data_load()
            self.success_failed_over = self.rest.fail_over(
                self.chosen[0].id, graceful=False)
            self.sleep(300)
            self.nodes = self.rest.node_statuses()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            # Rebalance the failed-over node out of the cluster.
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[self.chosen[0].id])
            # self.sleep(600)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg="Rebalance failed")
            servs_out = [node for node in self.cluster.servers
                         if node.ip == self.chosen[0].ip]
            self.cluster.nodes_in_cluster = list(
                set(self.cluster.nodes_in_cluster) - set(servs_out))
            self.available_servers += servs_out
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.sleep(10)
            self.tasks = []
            # Add back a node to restore the cluster size.
            rebalance_task = self.rebalance(nodes_in=1, nodes_out=0)
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 12: Failover a node and FullRecovery that node")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)
            disk_replica_dataset, disk_active_dataset = \
                self.bucket_util.get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)
            self.generate_docs()
            tasks_info = self.data_load()
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(
                self.chosen[0].id, graceful=False)
            self.sleep(300)

            # Mark Node for full recovery
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="full")
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)
            self.sleep(10)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 13: Failover a node and DeltaRecovery that node with loading in parallel")
            self.std_vbucket_dist = self.input.param("std_vbucket_dist", None)
            std = self.std_vbucket_dist or 1.0
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.cluster.nodes_in_cluster, self.bucket_util.buckets)
            self.sleep(10)
            disk_replica_dataset, disk_active_dataset = \
                self.bucket_util.get_and_compare_active_replica_data_set_all(
                    self.cluster.nodes_in_cluster, self.bucket_util.buckets,
                    path=None)
            self.rest = RestConnection(self.cluster.master)
            self.nodes = self.cluster_util.get_nodes(self.cluster.master)
            self.chosen = self.cluster_util.pick_nodes(self.cluster.master,
                                                       howmany=1)
            self.generate_docs()
            tasks_info = self.data_load()
            # Mark Node for failover
            self.success_failed_over = self.rest.fail_over(
                self.chosen[0].id, graceful=False)
            self.sleep(300)
            if self.success_failed_over:
                self.rest.set_recovery_type(otpNode=self.chosen[0].id,
                                            recoveryType="delta")
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers[:self.nodes_init], [], [])
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            # self.sleep(600)
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.sleep(10)

            self.data_validation_mode(tasks_info)

            self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.cluster.nodes_in_cluster,
                self.bucket_util.buckets)
            self.sleep(10)

            self.bucket_util.data_analysis_active_replica_all(
                disk_active_dataset, disk_replica_dataset,
                self.cluster.servers[:self.nodes_in + self.nodes_init],
                self.bucket_util.buckets, path=None)
            nodes = self.cluster_util.get_nodes_in_cluster(
                self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes, buckets=self.bucket_util.buckets,
                num_replicas=2, std=std,
                total_vbuckets=self.cluster_util.vbuckets)

            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 14: Updating the bucket replica to 1")
            bucket_helper = BucketHelper(self.cluster.master)
            for i in range(len(self.bucket_util.buckets)):
                bucket_helper.change_bucket_props(
                    self.bucket_util.buckets[i], replicaNumber=1)
            self.generate_docs()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads=self.new_num_writer_threads,
                    num_reader_threads=self.new_num_reader_threads)
            rebalance_task = self.task.async_rebalance(
                self.cluster.servers, [], [])
            tasks_info = self.data_load()
            if not self.atomicity:
                self.set_num_writer_and_reader_threads(
                    num_writer_threads="disk_io_optimized",
                    num_reader_threads="disk_io_optimized")
            # self.sleep(600, "Wait for Rebalance to start")
            self.task.jython_task_manager.get_task_result(rebalance_task)
            reached = RestHelper(self.rest).rebalance_reached(wait_step=120)
            self.assertTrue(reached,
                            "rebalance failed, stuck or did not complete")
            self.data_validation_mode(tasks_info)
            self.tasks = []
            self.bucket_util.print_bucket_stats()
            self.print_crud_stats()
            self.get_bucket_dgm(bucket)
            ###################################################################
            self.log.info("Step 15: Flush the bucket and start the entire process again")
            self.loop += 1
            if self.loop < self.iterations:
                # Flush the bucket
                self.bucket_util.flush_all_buckets(self.cluster.master)
                self.sleep(10)
                # Shrink the cluster back to its initial size before the
                # next iteration.
                if len(self.cluster.nodes_in_cluster) > self.nodes_init:
                    self.nodes_cluster = self.cluster.nodes_in_cluster[:]
                    self.nodes_cluster.remove(self.cluster.master)
                    servs_out = random.sample(
                        self.nodes_cluster,
                        int(len(self.cluster.nodes_in_cluster)
                            - self.nodes_init))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster.servers[:self.nodes_init], [], servs_out)
                    # self.sleep(600)
                    self.task.jython_task_manager.get_task_result(
                        rebalance_task)
                    self.available_servers += servs_out
                    self.cluster.nodes_in_cluster = list(
                        set(self.cluster.nodes_in_cluster) - set(servs_out))
                    reached = RestHelper(self.rest).rebalance_reached(
                        wait_step=120)
                    self.assertTrue(
                        reached,
                        "rebalance failed, stuck or did not complete")
                    self.get_bucket_dgm(bucket)
                self._iter_count = 0
            else:
                self.log.info("Volume Test Run Complete")
                self.get_bucket_dgm(bucket)
class OPD:
    """Operational primitives for volume/storage tests (Python 2 style).

    Mixin providing bucket/scope/collection setup, rebalance orchestration,
    doc-load generation/validation, crash injection and magma disk/stat
    checks. It is designed to be mixed into a BaseTestCase subclass: it
    reads many attributes (self.cluster, self.input, self.bucket_util,
    self.task, self.log, counters such as self.num_items / self.final_items,
    etc.) that the host test class must define before these methods run.
    """

    def __init__(self):
        pass

    def threads_calculation(self):
        """Read the 'pc' test param and size the doc-loading TaskManager."""
        self.process_concurrency = self.input.param("pc",
                                                    self.process_concurrency)
        self.doc_loading_tm = TaskManager(self.process_concurrency)

    def get_memory_footprint(self):
        """Return this test process's resident memory (RSS) in MB via `ps`."""
        out = subprocess.Popen(
            ['ps', 'v', '-p', str(os.getpid())],
            stdout=subprocess.PIPE).communicate()[0].split(b'\n')
        # NOTE(review): despite the name 'vsz_index', this locates the RSS
        # column in the `ps v` header and reads RSS, not VSZ.
        vsz_index = out[0].split().index(b'RSS')
        mem = float(out[1].split()[vsz_index]) / 1024  # KB -> MB
        self.PrintStep("RAM FootPrint: %s" % str(mem))
        return mem

    def create_required_buckets(self, cluster):
        """Create self.num_buckets buckets sized from the node's KV quota,
        then rebalance so the new buckets are spread across all nodes.

        No-op on cloud clusters (bucket management is external there).
        """
        if self.cluster.cloud_cluster:
            return
        self.log.info("Get the available memory quota")
        rest = RestConnection(cluster.master)
        self.info = rest.get_nodes_self()

        # threshold_memory_vagrant = 100
        # Reserve 100MB headroom out of the node's memory quota.
        kv_memory = self.info.memoryQuota - 100

        # Creating buckets for data loading purpose
        self.log.info("Create CB buckets")
        self.bucket_expiry = self.input.param("bucket_expiry", 0)
        ramQuota = self.input.param("ramQuota", kv_memory)
        buckets = ["GleamBookUsers"] * self.num_buckets
        # ';'-separated params are cycled across buckets.
        bucket_type = self.bucket_type.split(';') * self.num_buckets
        compression_mode = self.compression_mode.split(';') * self.num_buckets
        self.bucket_eviction_policy = self.bucket_eviction_policy
        for i in range(self.num_buckets):
            bucket = Bucket({
                Bucket.name: buckets[i] + str(i),
                Bucket.ramQuotaMB: ramQuota / self.num_buckets,
                Bucket.maxTTL: self.bucket_expiry,
                Bucket.replicaNumber: self.num_replicas,
                Bucket.storageBackend: self.bucket_storage,
                Bucket.evictionPolicy: self.bucket_eviction_policy,
                Bucket.bucketType: bucket_type[i],
                Bucket.flushEnabled: Bucket.FlushBucket.ENABLED,
                Bucket.compressionMode: compression_mode[i],
                Bucket.fragmentationPercentage: self.fragmentation})
            self.bucket_util.create_bucket(cluster, bucket)

        # rebalance the new buckets across all nodes.
        self.log.info("Rebalance Starts")
        self.nodes = rest.node_statuses()
        rest.rebalance(otpNodes=[node.id for node in self.nodes],
                       ejectedNodes=[])
        rest.monitorRebalance()

    def create_required_collections(self, cluster, num_scopes,
                                    num_collections):
        """Create scopes/collections in every bucket of `cluster`.

        The scope named by the 'scope_name' param (default '_default') is
        created first if non-default; then `num_scopes` prefixed scopes per
        bucket and `num_collections` prefixed collections per scope.
        Records the resulting collection names in self.collections.
        """
        self.scope_name = self.input.param("scope_name", "_default")
        if self.scope_name != "_default":
            self.bucket_util.create_scope(cluster, self.bucket,
                                          {"name": self.scope_name})
        if num_scopes > 1:
            self.scope_prefix = self.input.param("scope_prefix",
                                                 "VolumeScope")
            for bucket in cluster.buckets:
                for i in range(num_scopes):
                    scope_name = self.scope_prefix + str(i)
                    self.log.info("Creating scope: %s" % (scope_name))
                    self.bucket_util.create_scope(cluster.master,
                                                  bucket,
                                                  {"name": scope_name})
                    self.sleep(0.5)
            # Account for the implicit _default scope in addition to the
            # num_scopes scopes created above.
            self.num_scopes += 1
        for bucket in cluster.buckets:
            for scope in bucket.scopes.keys():
                if num_collections > 0:
                    self.collection_prefix = self.input.param(
                        "collection_prefix", "VolumeCollection")
                    for i in range(num_collections):
                        collection_name = self.collection_prefix + str(i)
                        self.bucket_util.create_collection(
                            cluster.master, bucket, scope,
                            {"name": collection_name})
                        self.sleep(0.5)
        self.collections = cluster.buckets[0].scopes[
            self.scope_name].collections.keys()
        self.log.debug("Collections list == {}".format(self.collections))

    def stop_purger(self, tombstone_purge_age=60):
        """
        1. Disable ts purger
        2. Create fts indexes (to create metakv, ns_config entries)
        3. Delete fts indexes
        4. Grep ns_config for '_deleted' to get total deleted keys count
        5. enable ts purger and age = 1 mins
        6. Sleep for 2 minutes
        7. Grep for debug.log and check for latest tombstones purged count
        8. Validate step4 count matches step 7 count for all nodes
        """
        self.rest.update_tombstone_purge_age_for_removal(tombstone_purge_age)
        self.rest.disable_tombstone_purger()

    def get_bucket_dgm(self, bucket):
        """Fetch and log the latest active resident-ratio (DGM %) for
        `bucket` from bucket stats; return the ratio."""
        self.rest_client = BucketHelper(self.cluster.master)
        dgm = self.rest_client.fetch_bucket_stats(
            bucket.name)["op"]["samples"][
            "vb_active_resident_items_ratio"][-1]
        self.log.info("Active Resident Threshold of {0} is {1}".format(
            bucket.name, dgm))
        return dgm

    def _induce_error(self, error_condition, nodes=[]):
        """Inject one failure mode on each of `nodes` (default: master).

        Supported: stop_server, enable_firewall, kill_memcached,
        reboot_server, kill_erlang; anything else fails the test.
        NOTE(review): mutable default argument `nodes=[]` — harmless here
        because it is only read, but consider `nodes=None`.
        """
        nodes = nodes or [self.cluster.master]
        for node in nodes:
            if error_condition == "stop_server":
                self.cluster_util.stop_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.start_firewall_on_node(node)
            elif error_condition == "kill_memcached":
                shell = RemoteMachineShellConnection(node)
                shell.kill_memcached()
                shell.disconnect()
            elif error_condition == "reboot_server":
                shell = RemoteMachineShellConnection(node)
                shell.reboot_node()
            elif error_condition == "kill_erlang":
                shell = RemoteMachineShellConnection(node)
                shell.kill_erlang()
                shell.disconnect()
            else:
                self.fail("Invalid error induce option")

    def _recover_from_error(self, error_condition):
        """Undo the error injected by _induce_error on every cluster node,
        then wait for KV warmup and ns_server readiness."""
        for node in self.cluster.nodes_in_cluster:
            if error_condition == "stop_server" \
                    or error_condition == "kill_erlang":
                self.cluster_util.start_server(node)
            elif error_condition == "enable_firewall":
                self.cluster_util.stop_firewall_on_node(node)

        for node in self.cluster.kv_nodes + [self.cluster.master]:
            self.check_warmup_complete(node)
            result = self.cluster_util.wait_for_ns_servers_or_assert(
                [node], wait_time=1200)
            self.assertTrue(result, "Server warmup failed")

    def rebalance(self, nodes_in=0, nodes_out=0, services=[],
                  retry_get_process_num=3000):
        """Start an async rebalance moving `nodes_in` servers in and
        `nodes_out` servers out for each service named in `services`.

        Picks out-nodes randomly from the per-service node lists (never the
        master for kv) and in-nodes from self.available_servers, keeping
        both bookkeeping lists up to date. When in == out (swap rebalance)
        vbucket-shuffling checks are disabled. Returns the rebalance task.
        NOTE(review): mutable default `services=[]` — only read, but
        `services=None` would be safer.
        """
        self.servs_in = list()
        self.nodes_cluster = self.cluster.nodes_in_cluster[:]
        self.nodes_cluster.remove(self.cluster.master)
        self.servs_out = list()
        services = services or ["kv"]

        print "KV nodes in cluster: %s" % [
            server.ip for server in self.cluster.kv_nodes]
        print "CBAS nodes in cluster: %s" % [
            server.ip for server in self.cluster.cbas_nodes]
        print "INDEX nodes in cluster: %s" % [
            server.ip for server in self.cluster.index_nodes]
        print "FTS nodes in cluster: %s" % [
            server.ip for server in self.cluster.fts_nodes]
        print "QUERY nodes in cluster: %s" % [
            server.ip for server in self.cluster.query_nodes]
        print "EVENTING nodes in cluster: %s" % [
            server.ip for server in self.cluster.eventing_nodes]
        print "AVAILABLE nodes for cluster: %s" % [
            server.ip for server in self.available_servers]

        if nodes_out:
            if "cbas" in services:
                servers = random.sample(self.cluster.cbas_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.cbas_nodes.remove(server)
            if "index" in services:
                servers = random.sample(self.cluster.index_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.index_nodes.remove(server)
            if "fts" in services:
                servers = random.sample(self.cluster.fts_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.fts_nodes.remove(server)
            if "query" in services:
                servers = random.sample(self.cluster.query_nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.query_nodes.remove(server)
            if "eventing" in services:
                servers = random.sample(self.cluster.eventing_nodes,
                                        nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.eventing_nodes.remove(server)
            if "kv" in services:
                # Never eject the master node.
                nodes = [node for node in self.cluster.kv_nodes
                         if node.ip != self.cluster.master.ip]
                servers = random.sample(nodes, nodes_out)
                self.servs_out.extend(servers)
                for server in servers:
                    self.cluster.kv_nodes.remove(server)

        if nodes_in:
            if "cbas" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.cbas_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]
            if "index" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.index_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]
            if "fts" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.fts_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]
            if "query" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.query_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]
            if "eventing" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.eventing_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]
            if "kv" in services:
                servers = random.sample(self.available_servers, nodes_in)
                self.servs_in.extend(servers)
                self.cluster.kv_nodes.extend(servers)
                self.available_servers = [
                    servs for servs in self.available_servers
                    if servs not in servers]

        print "Servers coming in : %s with services: %s" % (
            [server.ip for server in self.servs_in], services)
        print "Servers going out : %s" % (
            [server.ip for server in self.servs_out])
        # Ejected nodes become available again for later rebalances.
        self.available_servers.extend(self.servs_out)
        print "NEW AVAILABLE nodes for cluster: %s" % (
            [server.ip for server in self.available_servers])

        if nodes_in == nodes_out:
            # Swap rebalance: vbucket shuffling checks do not apply.
            self.vbucket_check = False

        rebalance_task = self.task.async_rebalance(
            self.cluster, self.servs_in, self.servs_out,
            services=services,
            check_vbucket_shuffling=self.vbucket_check,
            retry_get_process_num=retry_get_process_num)
        return rebalance_task

    def generate_docs(self, doc_ops=None,
                      create_end=None, create_start=None,
                      update_end=None, update_start=None,
                      delete_end=None, delete_start=None,
                      expire_end=None, expire_start=None,
                      read_end=None, read_start=None):
        """Compute key ranges for the next load cycle for every op in
        `doc_ops` (defaults to self.doc_ops).

        Explicit *_start/*_end arguments win; otherwise ranges are derived
        from self.num_items and self.mutation_perc, with expiry chained
        after delete and create appended past self.end. Updates the
        running self.start/self.end window and self.final_items item-count
        expectation (per collection per scope).
        """
        self.get_memory_footprint()
        self.create_end = 0
        self.create_start = 0
        self.read_end = 0
        self.read_start = 0
        self.update_end = 0
        self.update_start = 0
        self.delete_end = 0
        self.delete_start = 0
        self.expire_end = 0
        self.expire_start = 0
        self.initial_items = self.final_items

        doc_ops = doc_ops or self.doc_ops
        self.mutations_to_validate = doc_ops

        if "read" in doc_ops:
            if read_start is not None:
                self.read_start = read_start
            else:
                self.read_start = 0
            if read_end is not None:
                self.read_end = read_end
            else:
                self.read_end = self.num_items * self.mutation_perc / 100

        if "update" in doc_ops:
            if update_start is not None:
                self.update_start = update_start
            else:
                self.update_start = 0
            if update_end is not None:
                self.update_end = update_end
            else:
                self.update_end = self.num_items * self.mutation_perc / 100
            # Bump the expected mutation counter for validation.
            self.mutate += 1

        if "delete" in doc_ops:
            if delete_start is not None:
                self.delete_start = delete_start
            else:
                self.delete_start = self.start
            if delete_end is not None:
                self.delete_end = delete_end
            else:
                self.delete_end = self.start \
                    + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.delete_end - self.delete_start) \
                * self.num_collections * self.num_scopes

        if "expiry" in doc_ops:
            if self.maxttl == 0:
                self.maxttl = self.input.param("maxttl", 10)
            if expire_start is not None:
                self.expire_start = expire_start
            else:
                # Expiry range follows immediately after the delete range.
                self.expire_start = self.delete_end
            if expire_end is not None:
                self.expire_end = expire_end
            else:
                self.expire_end = self.expire_start \
                    + self.num_items * self.mutation_perc / 100
            self.final_items -= (self.expire_end - self.expire_start) \
                * self.num_collections * self.num_scopes

        if "create" in doc_ops:
            if create_start is not None:
                self.create_start = create_start
            else:
                self.create_start = self.end
            self.start = self.create_start
            if create_end is not None:
                self.create_end = create_end
            else:
                # Create enough new keys to replace deleted + expired ones.
                self.create_end = self.end \
                    + (self.expire_end - self.expire_start) \
                    + (self.delete_end - self.delete_start)
            self.end = self.create_end
            self.final_items += (abs(self.create_end - self.create_start)) \
                * self.num_collections * self.num_scopes

        print "Read Start: %s" % self.read_start
        print "Read End: %s" % self.read_end
        print "Update Start: %s" % self.update_start
        print "Update End: %s" % self.update_end
        print "Expiry Start: %s" % self.expire_start
        print "Expiry End: %s" % self.expire_end
        print "Delete Start: %s" % self.delete_start
        print "Delete End: %s" % self.delete_end
        print "Create Start: %s" % self.create_start
        print "Create End: %s" % self.create_end
        print "Final Start: %s" % self.start
        print "Final End: %s" % self.end

    def _loader_dict(self, cmd={}):
        """Build self.loader_map: one DocumentGenerator per non-default
        bucket/scope/collection, parameterized by the current key ranges.

        `cmd` can override any workload setting (keyPrefix, docSize, op
        percentages, etc.).
        NOTE(review): mutable default `cmd={}` — only read via .get(), so
        safe in practice, but `cmd=None` would be the safer idiom.
        """
        self.loader_map = dict()
        for bucket in self.cluster.buckets:
            for scope in bucket.scopes.keys():
                for collection in bucket.scopes[scope].collections.keys():
                    # The default collection in the default scope is not
                    # loaded.
                    if collection == "_default" and scope == "_default":
                        continue
                    ws = WorkLoadSettings(
                        cmd.get("keyPrefix", self.key),
                        cmd.get("keySize", self.key_size),
                        cmd.get("docSize", self.doc_size),
                        cmd.get("cr", self.create_perc),
                        cmd.get("rd", self.read_perc),
                        cmd.get("up", self.update_perc),
                        cmd.get("dl", self.delete_perc),
                        cmd.get("ex", self.expiry_perc),
                        cmd.get("workers", self.process_concurrency),
                        cmd.get("ops", self.ops_rate),
                        cmd.get("loadType", None),
                        cmd.get("keyType", None),
                        cmd.get("valueType", None),
                        cmd.get("validate", False),
                        cmd.get("gtm", False),
                        cmd.get("deleted", False),
                        cmd.get("mutated", 0))
                    hm = HashMap()
                    hm.putAll({DRConstants.create_s: self.create_start,
                               DRConstants.create_e: self.create_end,
                               DRConstants.update_s: self.update_start,
                               DRConstants.update_e: self.update_end,
                               DRConstants.expiry_s: self.expire_start,
                               DRConstants.expiry_e: self.expire_end,
                               DRConstants.delete_s: self.delete_start,
                               DRConstants.delete_e: self.delete_end,
                               DRConstants.read_s: self.read_start,
                               DRConstants.read_e: self.read_end})
                    dr = DocRange(hm)
                    ws.dr = dr
                    dg = DocumentGenerator(ws, self.key_type, self.val_type)
                    self.loader_map.update(
                        {bucket.name + scope + collection: dg})

    def wait_for_doc_load_completion(self, tasks, wait_for_stats=True):
        """Block until all submitted load tasks finish, retry their failed
        mutations once, assert per-task success, and (optionally) wait for
        bucket item counts to settle.

        Retries of failed mutations tolerate not-found/exists errors;
        OOM/timeout on retry marks the task failed. On a stats-verification
        failure on non-cloud clusters, memcached backtraces are dumped
        before re-raising.
        """
        self.doc_loading_tm.getAllTaskResult()
        self.get_memory_footprint()
        for task in tasks:
            task.result = True
            unique_str = "{}:{}:{}:".format(task.sdk.bucket, task.sdk.scope,
                                            task.sdk.collection)
            for optype, failures in task.failedMutations.items():
                for failure in failures:
                    if failure is not None:
                        print("Test Retrying {}: {}{} -> {}".format(
                            optype, unique_str, failure.id(),
                            failure.err().getClass().getSimpleName()))
                        try:
                            if optype == "create":
                                task.docops.insert(failure.id(),
                                                   failure.document(),
                                                   task.sdk.connection,
                                                   task.setOptions)
                            if optype == "update":
                                task.docops.upsert(failure.id(),
                                                   failure.document(),
                                                   task.sdk.connection,
                                                   task.upsertOptions)
                            if optype == "delete":
                                task.docops.delete(failure.id(),
                                                   task.sdk.connection,
                                                   task.removeOptions)
                        except (ServerOutOfMemoryException,
                                TimeoutException) as e:
                            print("Retry {} failed for key: {} - {}".format(
                                optype, failure.id(), e))
                            task.result = False
                        except (DocumentNotFoundException,
                                DocumentExistsException) as e:
                            # Expected races after the original failure;
                            # intentionally ignored.
                            pass
            try:
                task.sdk.disconnectCluster()
            except Exception as e:
                print(e)
            self.assertTrue(task.result,
                            "Task Failed: {}".format(task.taskName))
        if wait_for_stats:
            try:
                self.bucket_util._wait_for_stats_all_buckets(
                    self.cluster, self.cluster.buckets, timeout=14400)
                if self.track_failures:
                    self.bucket_util.verify_stats_all_buckets(
                        self.cluster, self.final_items, timeout=14400)
            except Exception as e:
                if not self.cluster.cloud_cluster:
                    self.get_gdb()
                raise e

    def get_gdb(self):
        """Dump all-thread backtraces of memcached on every cluster node
        (diagnostic aid when stats verification fails)."""
        for node in self.cluster.nodes_in_cluster:
            gdb_shell = RemoteMachineShellConnection(node)
            gdb_out = gdb_shell.execute_command(
                'gdb -p `(pidof memcached)` -ex "thread apply all bt" '
                '-ex detach -ex quit')[0]
            print node.ip
            print gdb_out
            gdb_shell.disconnect()

    def data_validation(self):
        """Re-read every mutated key range and validate document contents.

        For each op type recorded in self.mutations_to_validate, builds a
        read-only (validate=True) workload over that op's key range
        (deleted docs are validated as absent) and runs it across all
        non-default collections, asserting every validation task passes.
        Skipped entirely unless self._data_validation is set.
        """
        self.get_memory_footprint()
        doc_ops = self.mutations_to_validate
        pc = min(self.process_concurrency, 20)
        if self._data_validation:
            self.log.info("Validating Active/Replica Docs")
            cmd = dict()
            self.ops_rate = self.input.param("ops_rate", 2000)
            master = Server(self.cluster.master.ip, self.cluster.master.port,
                            self.cluster.master.rest_username,
                            self.cluster.master.rest_password,
                            str(self.cluster.master.memcached_port))
            self.loader_map = dict()
            for bucket in self.cluster.buckets:
                for scope in bucket.scopes.keys():
                    for collection in bucket.scopes[
                            scope].collections.keys():
                        if collection == "_default" \
                                and scope == "_default":
                            continue
                        for op_type in doc_ops:
                            cmd.update({"deleted": False})
                            hm = HashMap()
                            if op_type == "create":
                                hm.putAll({
                                    DRConstants.read_s: self.create_start,
                                    DRConstants.read_e: self.create_end})
                            elif op_type == "update":
                                hm.putAll({
                                    DRConstants.read_s: self.update_start,
                                    DRConstants.read_e: self.update_end})
                            elif op_type == "delete":
                                hm.putAll({
                                    DRConstants.read_s: self.delete_start,
                                    DRConstants.read_e: self.delete_end})
                                cmd.update({"deleted": True})
                            else:
                                continue
                            dr = DocRange(hm)
                            ws = WorkLoadSettings(
                                cmd.get("keyPrefix", self.key),
                                cmd.get("keySize", self.key_size),
                                cmd.get("docSize", self.doc_size),
                                cmd.get("cr", 0),
                                cmd.get("rd", 100),
                                cmd.get("up", 0),
                                cmd.get("dl", 0),
                                cmd.get("ex", 0),
                                cmd.get("workers", pc),
                                cmd.get("ops", self.ops_rate),
                                cmd.get("loadType", None),
                                cmd.get("keyType", None),
                                cmd.get("valueType", None),
                                cmd.get("validate", True),
                                cmd.get("gtm", False),
                                cmd.get("deleted", False),
                                cmd.get("mutated", 0))
                            ws.dr = dr
                            dg = DocumentGenerator(ws, self.key_type,
                                                   self.val_type)
                            self.loader_map.update({
                                bucket.name + scope + collection + op_type:
                                    dg})

            tasks = list()
            i = pc
            while i > 0:
                for bucket in self.cluster.buckets:
                    for scope in bucket.scopes.keys():
                        for collection in bucket.scopes[
                                scope].collections.keys():
                            if collection == "_default" \
                                    and scope == "_default":
                                continue
                            for op_type in doc_ops:
                                if op_type not in [
                                        "create", "update", "delete"]:
                                    continue
                                client = NewSDKClient(master, bucket.name,
                                                      scope, collection)
                                client.initialiseSDK()
                                self.sleep(1)
                                taskName = "Validate_%s_%s_%s_%s_%s_%s" % (
                                    bucket.name, scope, collection, op_type,
                                    str(i), time.time())
                                task = WorkLoadGenerate(
                                    taskName,
                                    self.loader_map[
                                        bucket.name + scope + collection
                                        + op_type],
                                    client, "NONE", self.maxttl,
                                    self.time_unit, self.track_failures, 0)
                                tasks.append(task)
                                self.doc_loading_tm.submit(task)
                                i -= 1

            self.doc_loading_tm.getAllTaskResult()
            for task in tasks:
                try:
                    task.sdk.disconnectCluster()
                except Exception as e:
                    print(e)
            for task in tasks:
                self.assertTrue(task.result,
                                "Validation Failed for: %s" % task.taskName)
        self.get_memory_footprint()

    def print_crud_stats(self):
        """Log a table of expected item counts and per-op key ranges for
        the last load cycle."""
        self.table = TableView(self.log.info)
        self.table.set_headers(["Initial Items",
                                "Current Items",
                                "Items Updated",
                                "Items Created",
                                "Items Deleted",
                                "Items Expired"])
        self.table.add_row([
            str(self.initial_items),
            str(self.final_items),
            str(abs(self.update_start)) + "-" + str(abs(self.update_end)),
            str(abs(self.create_start)) + "-" + str(abs(self.create_end)),
            str(abs(self.delete_start)) + "-" + str(abs(self.delete_end)),
            str(abs(self.expire_start)) + "-" + str(abs(self.expire_end))
            ])
        self.table.display("Docs statistics")

    def perform_load(self, crash=False, num_kills=1, wait_for_load=True,
                     validate_data=True):
        """Run the configured workload across all non-default collections.

        Spawns self.process_concurrency WorkLoadGenerate tasks (one SDK
        client each). If wait_for_load is False the task list is returned
        immediately; otherwise the method waits for completion, optionally
        kills memcached (`crash`), optionally validates data, prints stats
        and asserts no coredumps/crash logs on non-cloud clusters.
        """
        self.get_memory_footprint()
        self._loader_dict()
        master = Server(self.cluster.master.ip, self.cluster.master.port,
                        self.cluster.master.rest_username,
                        self.cluster.master.rest_password,
                        str(self.cluster.master.memcached_port))
        tasks = list()
        i = self.process_concurrency
        while i > 0:
            for bucket in self.cluster.buckets:
                for scope in bucket.scopes.keys():
                    for collection in bucket.scopes[
                            scope].collections.keys():
                        if collection == "_default" and scope == "_default":
                            continue
                        client = NewSDKClient(master, bucket.name, scope,
                                              collection)
                        client.initialiseSDK()
                        self.sleep(1)
                        self.get_memory_footprint()
                        taskName = "Loader_%s_%s_%s_%s_%s" % (
                            bucket.name, scope, collection, str(i),
                            time.time())
                        task = WorkLoadGenerate(
                            taskName,
                            self.loader_map[bucket.name + scope + collection],
                            client, self.durability_level,
                            self.maxttl, self.time_unit,
                            self.track_failures, 0)
                        tasks.append(task)
                        self.doc_loading_tm.submit(task)
                        i -= 1
        if wait_for_load:
            self.wait_for_doc_load_completion(tasks)
            self.get_memory_footprint()
        else:
            # Caller takes ownership of the running tasks.
            return tasks
        if crash:
            self.kill_memcached(num_kills=num_kills)
        if validate_data:
            self.data_validation()

        self.print_stats()

        if self.cluster.cloud_cluster:
            return

        result = self.check_coredump_exist(self.cluster.nodes_in_cluster)
        if result:
            self.PrintStep("CRASH | CRITICAL | WARN messages found in "
                           "cb_logs")
            if self.assert_crashes_on_load:
                self.task_manager.abort_all_tasks()
                self.doc_loading_tm.abortAllTasks()
                self.assertFalse(result)

    def get_magma_disk_usage(self, bucket=None):
        """Sum magma on-disk usage (MB) for `bucket` across all cluster
        nodes via `du` over remote shells.

        Also warns when any kvstore directory holds >= 300 data files.
        Returns (kvstore, wal, keyTree, seqTree) totals in MB.
        """
        if bucket is None:
            bucket = self.bucket
        servers = self.cluster.nodes_in_cluster
        kvstore = 0
        wal = 0
        keyTree = 0
        seqTree = 0
        data_files = 0

        for server in servers:
            shell = RemoteMachineShellConnection(server)
            bucket_path = os.path.join(
                RestConnection(server).get_data_path(), bucket.name)
            kvstore += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'\
                " % os.path.join(bucket_path,
                                 "magma.*/kv*"))[0][0].split('\n')[0])
            wal += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'\
                " % os.path.join(bucket_path,
                                 "magma.*/wal"))[0][0].split('\n')[0])
            keyTree += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'\
                " % os.path.join(bucket_path,
                                 "magma.*/kv*/rev*/key*"))[0][0]
                .split('\n')[0])
            seqTree += int(shell.execute_command(
                "du -cm %s | tail -1 | awk '{print $1}'\
                " % os.path.join(bucket_path,
                                 "magma.*/kv*/rev*/seq*"))[0][0]
                .split('\n')[0])

            cmd = 'find ' + bucket_path + '/magma*/ -maxdepth 1 -type d \
            -print0 | while read -d "" -r dir; do files=("$dir"/*/*/*); \
            printf "%d,%s\n" "${#files[@]}" "$dir"; done'
            data_files = shell.execute_command(cmd)[0]
            for files in data_files:
                if "kvstore" in files and int(files.split(",")[0]) >= 300:
                    self.log.warn("Number of files in {}--{} is {}".format(
                        server.ip, files.split(",")[1].rstrip(),
                        files.split(",")[0]))
            shell.disconnect()
        self.log.debug("Total Disk usage for kvstore is {}MB".format(kvstore))
        self.log.debug("Total Disk usage for wal is {}MB".format(wal))
        self.log.debug("Total Disk usage for keyTree is {}MB".format(keyTree))
        self.log.debug("Total Disk usage for seqTree is {}MB".format(seqTree))
        return kvstore, wal, keyTree, seqTree

    def print_stats(self):
        """Log bucket/cluster/CRUD stats; for magma buckets on non-cloud
        clusters also log disk usage and fragmentation."""
        self.bucket_util.print_bucket_stats(self.cluster)
        self.cluster_util.print_cluster_stats(self.cluster)
        self.print_crud_stats()
        for bucket in self.cluster.buckets:
            self.get_bucket_dgm(bucket)
            if bucket.storageBackend == Bucket.StorageBackend.magma \
                    and not self.cluster.cloud_cluster:
                self.get_magma_disk_usage(bucket)
                self.check_fragmentation_using_magma_stats(bucket)
                self.check_fragmentation_using_kv_stats(bucket)

    def PrintStep(self, msg=None):
        """Print a visually distinct banner for `msg` to the console."""
        print "\n"
        print "\t", "#" * 60
        print "\t", "#"
        print "\t", "# %s" % msg
        print "\t", "#"
        print "\t", "#" * 60
        print "\n"

    def check_fragmentation_using_kv_stats(self, bucket, servers=None):
        """Log per-node KV fragmentation for `bucket` (default: all KV
        nodes plus master). Purely informational; returns nothing."""
        result = dict()
        if servers is None:
            servers = self.cluster.kv_nodes + [self.cluster.master]
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            frag_val = self.bucket_util.get_fragmentation_kv(
                self.cluster, bucket, server)
            self.log.debug("Current Fragmentation for node {} is {} \
            ".format(server.ip, frag_val))
            result.update({server.ip: frag_val})
        self.log.info("KV stats fragmentation values {}".format(result))

    def dump_magma_stats(self, server, bucket, shard, kvstore):
        """Periodically (every 10 min) log magma stats and the seq-tree
        dump for one shard/kvstore until self.stop_stats is set.

        No-op for non-magma buckets and cloud clusters.
        NOTE(review): get_magma_stats is defined as (bucket, server=None);
        the three-argument call below would raise TypeError if this loop
        body ever runs — confirm the intended signature.
        """
        if bucket.storageBackend != Bucket.StorageBackend.magma \
                or self.cluster.cloud_cluster:
            return
        shell = RemoteMachineShellConnection(server)
        data_path = RestConnection(server).get_data_path()
        while not self.stop_stats:
            for bucket in self.cluster.buckets:
                self.log.info(
                    self.get_magma_stats(bucket, server, "rw_0:magma"))
                self.dump_seq_index(shell, data_path, bucket.name, shard,
                                    kvstore)
            self.sleep(600)
        shell.disconnect()

    def dump_seq_index(self, shell, data_path, bucket, shard, kvstore):
        """Log the magma seq-tree dump for one bucket shard/kvstore using
        the magma_dump CLI over `shell`."""
        magma_path = os.path.join(data_path, bucket, "magma.{}")
        magma = magma_path.format(shard)
        cmd = '/opt/couchbase/bin/magma_dump {}'.format(magma)
        cmd += ' --kvstore {} --tree seq'.format(kvstore)
        result = shell.execute_command(cmd)[0]
        self.log.info("Seq Tree for {}:{}:{}:{}: \n{}".format(
            shell.ip, bucket, shard, kvstore, result))

    def check_fragmentation_using_magma_stats(self, bucket, servers=None):
        """Return True iff the max magma fragmentation across all rw_N
        shards of all `servers` is below the configured threshold
        (self.fragmentation, a percentage).

        Shard count per node is min(cpu_count, 64), matching magma's
        per-core writer shards.
        """
        result = dict()
        stats = list()
        if servers is None:
            servers = self.cluster.kv_nodes + [self.cluster.master]
        if type(servers) is not list:
            servers = [servers]
        for server in servers:
            fragmentation_values = list()
            shell = RemoteMachineShellConnection(server)
            output = shell.execute_command(
                "lscpu | grep 'CPU(s)' | head -1 | awk '{print $2}'"
                )[0][0].split('\n')[0]
            shell.disconnect()
            self.log.debug("machine: {} - core(s): {}".format(server.ip,
                                                              output))
            for i in range(min(int(output), 64)):
                grep_field = "rw_{}:magma".format(i)
                _res = self.get_magma_stats(bucket, server)
                fragmentation_values.append(
                    json.loads(_res[server.ip][grep_field])["Fragmentation"])
                stats.append(_res)
            result.update({server.ip: fragmentation_values})
        self.log.info(stats[0])
        res = list()
        for value in result.values():
            res.append(max(value))
        if max(res) < float(self.fragmentation) / 100:
            self.log.info("magma stats fragmentation result {} \
            ".format(result))
            return True
        self.log.info("magma stats fragmentation result {} \
        ".format(result))
        return False

    def get_magma_stats(self, bucket, server=None):
        """Fetch magma stats for `bucket` from one server via cbstats;
        return {server.ip: stats}."""
        magma_stats_for_all_servers = dict()
        cbstat_obj = Cbstats(server)
        result = cbstat_obj.magma_stats(bucket.name)
        magma_stats_for_all_servers[server.ip] = result
        return magma_stats_for_all_servers

    def pause_rebalance(self):
        """Stop and restart an in-flight rebalance at every 20% progress
        mark; return the task for the final (uninterrupted) restart.

        NOTE(review): if the cluster is already rebalanced on the first
        iteration, `rebalance_task` is never assigned and the final
        `return` would raise UnboundLocalError — confirm callers always
        invoke this with a rebalance in progress.
        """
        rest = RestConnection(self.cluster.master)
        i = 1
        self.sleep(10, "Let the rebalance begin!")
        expected_progress = 20
        while expected_progress < 100:
            expected_progress = 20 * i
            reached = self.cluster_util.rebalance_reached(rest,
                                                          expected_progress)
            self.assertTrue(reached,
                            "Rebalance failed or did not reach {0}%".format(
                                expected_progress))
            if not self.cluster_util.is_cluster_rebalanced(rest):
                self.log.info("Stop the rebalance")
                stopped = rest.stop_rebalance(
                    wait_timeout=self.wait_timeout / 3)
                self.assertTrue(stopped, msg="Unable to stop rebalance")
                rebalance_task = self.task.async_rebalance(
                    self.cluster, [], [], retry_get_process_num=3000)
                self.sleep(10,
                           "Rebalance % ={}. Let the rebalance begin!".format(
                               expected_progress))
            i += 1
        return rebalance_task

    def abort_rebalance(self, rebalance, error_type="kill_memcached"):
        """Abort an in-flight rebalance at 20/40/60/80% progress by
        injecting `error_type` on all nodes, recover, then restart the
        rebalance. Returns the last restarted task, or None if the
        original rebalance finished despite the abort (unexpected).
        """
        self.sleep(30, "Let the rebalance begin!")
        rest = RestConnection(self.cluster.master)
        i = 1
        expected_progress = 20
        rebalance_task = rebalance
        while expected_progress < 80:
            expected_progress = 20 * i
            reached = self.cluster_util.rebalance_reached(
                rest, expected_progress, wait_step=10, num_retry=3600)
            self.assertTrue(reached,
                            "Rebalance failed or did not reach {0}%".format(
                                expected_progress))

            if not self.cluster_util.is_cluster_rebalanced(rest):
                self.log.info("Abort rebalance")
                self._induce_error(error_type, self.cluster.nodes_in_cluster)
                result = self.check_coredump_exist(
                    self.cluster.nodes_in_cluster)
                if result:
                    self.task_manager.abort_all_tasks()
                    self.doc_loading_tm.abortAllTasks()
                    self.assertFalse(
                        result,
                        "CRASH | CRITICAL | WARN messages found in cb_logs")
                self.sleep(60, "Sleep after error introduction")
                self._recover_from_error(error_type)
                result = self.check_coredump_exist(
                    self.cluster.nodes_in_cluster)
                if result:
                    self.task_manager.abort_all_tasks()
                    self.doc_loading_tm.abortAllTasks()
                    self.assertFalse(
                        result,
                        "CRASH | CRITICAL | WARN messages found in cb_logs")
                try:
                    self.task_manager.get_task_result(rebalance_task)
                except RebalanceFailedException:
                    # Expected: the induced error should fail the rebalance.
                    pass
                if rebalance.result:
                    self.log.error(
                        "Rebalance passed/finished which is not expected")
                    self.log.info(
                        "Rebalance % after rebalance finished = {}".format(
                            expected_progress))
                    return None
                else:
                    self.log.info(
                        "Restarting Rebalance after killing at {}".format(
                            expected_progress))
                    rebalance_task = self.task.async_rebalance(
                        self.cluster, [], self.servs_out,
                        retry_get_process_num=3000)
                    self.sleep(120, "Let the rebalance begin after abort")
                    self.log.info("Rebalance % = {}".format(
                        self.rest._rebalance_progress()))
            i += 1
        return rebalance_task

    def crash_memcached(self, nodes=None, num_kills=1, graceful=False):
        """Repeatedly kill memcached on `nodes` (default: KV nodes plus
        master) at random 60-120s intervals until self.crashes iterations
        have run or self.stop_crash is set externally."""
        self.stop_crash = False
        self.crash_count = 0
        if not nodes:
            nodes = self.cluster.kv_nodes + [self.cluster.master]

        while not self.stop_crash:
            self.get_memory_footprint()
            sleep = random.randint(60, 120)
            self.sleep(sleep,
                       "Iteration:{} waiting to kill memc on all nodes".
                       format(self.crash_count))
            self.kill_memcached(nodes, num_kills=num_kills,
                                graceful=graceful, wait=True)
            self.crash_count += 1
            if self.crash_count > self.crashes:
                self.stop_crash = True
        self.sleep(300)

    def kill_memcached(self, servers=None, num_kills=1,
                       graceful=False, wait=True):
        """Kill (or gracefully restart) memcached `num_kills` times on each
        of `servers`; assert no coredumps resulted; optionally wait for
        warmup on every server afterwards."""
        if not servers:
            servers = self.cluster.kv_nodes + [self.cluster.master]

        for server in servers:
            for _ in xrange(num_kills):
                if num_kills > 1:
                    self.sleep(2,
                               "Sleep for 2 seconds b/w cont memc kill on "
                               "same node.")
                shell = RemoteMachineShellConnection(server)
                if graceful:
                    shell.restart_couchbase()
                else:
                    shell.kill_memcached()
                shell.disconnect()
            self.sleep(5,
                       "Sleep for 5 seconds before killing memc on next "
                       "node.")

        result = self.check_coredump_exist(self.cluster.nodes_in_cluster)
        if result:
            self.stop_crash = True
            self.task_manager.abort_all_tasks()
            self.doc_loading_tm.abortAllTasks()
            self.assertFalse(
                result,
                "CRASH | CRITICAL | WARN messages found in cb_logs")
        if wait:
            for server in servers:
                self.check_warmup_complete(server)

    def check_warmup_complete(self, server):
        """Wait for bucket warmup on `server`; abort all tasks and fail the
        test if warmup does not finish in time.

        NOTE(review): the loop iterates all buckets but always waits on
        self.cluster.buckets[0] — confirm whether each `bucket` was meant
        to be passed instead.
        """
        for bucket in self.cluster.buckets:
            start_time = time.time()
            result = self.bucket_util._wait_warmup_completed(
                [server], self.cluster.buckets[0],
                wait_time=self.wait_timeout * 20)
            if not result:
                self.stop_crash = True
                self.task_manager.abort_all_tasks()
                self.doc_loading_tm.abortAllTasks()
                self.assertTrue(result,
                                "Warm-up failed in %s seconds" % (
                                    self.wait_timeout * 20))
            else:
                self.log.info("Bucket:%s warm-up completed in %s." % (
                    bucket.name, str(time.time() - start_time)))

    def set_num_writer_and_reader_threads(self, num_writer_threads="default",
                                          num_reader_threads="default",
                                          num_storage_threads="default"):
        """Update memcached reader/writer/storage thread settings cluster-
        wide via BucketHelper (values may be ints or named presets such as
        "default" / "disk_io_optimized")."""
        bucket_helper = BucketHelper(self.cluster.master)
        bucket_helper.update_memcached_settings(
            num_writer_threads=num_writer_threads,
            num_reader_threads=num_reader_threads,
            num_storage_threads=num_storage_threads)