def wait_for_ns_servers_or_assert(servers, testcase, wait_time=360, wait_if_warmup=False):
    """Block until ns_server responds on every server, failing the testcase
    otherwise.

    If a server does not respond and wait_if_warmup is True, wait instead for
    every bucket on that server to finish warmup before giving up.
    """
    log = logger.Logger.get_logger()
    for node in servers:
        rest = RestConnection(node)
        log.info("waiting for ns_server @ {0}:{1}".format(node.ip, node.port))
        if RestHelper(rest).is_ns_server_running(wait_time):
            log.info("ns_server @ {0}:{1} is running".format(node.ip, node.port))
            continue
        if not wait_if_warmup:
            testcase.fail("ns_server {0} is not running in {1} sec".format(
                node.ip, wait_time))
        # ns_server not reachable yet: give every bucket a chance to warm up
        for bucket in rest.get_buckets():
            testcase.assertTrue(
                ClusterOperationHelper._wait_warmup_completed(
                    testcase, [node], bucket.name, wait_time),
                "warmup was not completed!")
def wait_for_vbuckets_ready_state(node, bucket, timeout_in_seconds=300, log_msg=''):
    """Poll every memcached in the cluster until all of `bucket`'s vbuckets
    report a ready (active) state, or until the timeout expires.

    Returns True iff every vbucket became ready within timeout_in_seconds.
    """
    log = logger.Logger.get_logger()
    start_time = time.time()
    end_time = start_time + timeout_in_seconds
    # vbucket id -> True once seen in a ready state; entries are removed
    # again if the vbucket later leaves the active state
    ready_vbuckets = {}
    rest = RestConnection(node)
    servers = rest.get_nodes()
    RestHelper(rest).vbucket_map_ready(bucket, 60)
    vbucket_count = len(rest.get_vbuckets(bucket))
    vbuckets = rest.get_vbuckets(bucket)
    obj = VBucketAwareMemcached(rest, bucket)
    memcacheds, vbucket_map, vbucket_map_replica = obj.request_map(
        rest, bucket)
    #Create dictionary with key:"ip:port" and value: a list of vbuckets
    server_dict = defaultdict(list)
    for everyID in range(0, vbucket_count):
        memcached_ip_port = str(vbucket_map[everyID])
        server_dict[memcached_ip_port].append(everyID)
    while time.time() < end_time and len(ready_vbuckets) < vbucket_count:
        for every_ip_port in server_dict:
            #Retrieve memcached ip and port
            ip, port = every_ip_port.split(":")
            client = MemcachedClient(ip, int(port), timeout=30)
            client.vbucket_count = len(vbuckets)
            bucket_info = rest.get_bucket(bucket)
            # authenticate before issuing vbucket stat commands
            client.sasl_auth_plain(
                bucket_info.name.encode('ascii'),
                bucket_info.saslPassword.encode('ascii'))
            for i in server_dict[every_ip_port]:
                try:
                    (a, b, c) = client.get_vbucket_state(i)
                except mc_bin_client.MemcachedError as e:
                    # transient errors are expected while the node settles;
                    # log and retry this vbucket on the next poll
                    log.error("%s: %s" % (log_msg, e))
                    continue
                # NOTE(review): \x01/\x02 appear to be the wire markers for a
                # ready (active) vbucket state in this payload — confirm
                # against the memcached protocol used here
                if c.find("\x01") > 0 or c.find("\x02") > 0:
                    ready_vbuckets[i] = True
                elif i in ready_vbuckets:
                    log.warning(
                        "vbucket state changed from active to {0}".format(
                            c))
                    del ready_vbuckets[i]
            client.close()
    return len(ready_vbuckets) == vbucket_count
def _create_default_bucket(self):
    """Create the "default" bucket on the first server if it is missing,
    then reset the load-thread bookkeeping attributes."""
    name = "default"
    master = self.servers[0]
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
        quota = int(rest.get_nodes_self().mcdMemoryReserved * ratio)
        rest.create_bucket(bucket=name, ramQuotaMB=quota)
        warmed_up = BucketOperationHelper.wait_for_memcached(master, name)
        self.assertTrue(warmed_up, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
    self.load_thread = None
    self.shutdown_load_data = False
def _create_default_bucket(self):
    """Create self.bucket (with at least a 256 MB quota) if it does not
    already exist, and assert it is usable afterwards."""
    helper = RestHelper(self.rest)
    if helper.bucket_exists(self.bucket):
        return
    ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
    info = self.rest.get_nodes_self()
    # never go below the 256 MB bucket minimum
    quota = max(256, int(info.memoryQuota * ratio))
    self.rest.create_bucket(bucket=self.bucket, ramQuotaMB=quota)
    ready = BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
    self.testcase.assertTrue(ready, "wait_for_memcached failed")
    self.testcase.assertTrue(
        helper.bucket_exists(self.bucket),
        "unable to create {0} bucket".format(self.bucket))
def rebalance_out(self, how_many):
    """Rebalance `how_many` randomly chosen non-master nodes out of the
    cluster, then (best effort) restart membase on the ejected servers so
    they can be re-added later.

    Returns the rebalance result if nodes were ejected, True otherwise.
    """
    # BUG FIX: the message hard-coded "three nodes"; report the real count.
    msg = "choosing {0} nodes and rebalance them out from the cluster".format(how_many)
    self.log.info(msg)
    rest = RestConnection(self._servers[0])
    nodes = rest.node_statuses()
    nodeIps = [node.ip for node in nodes]
    self.log.info("current nodes : {0}".format(nodeIps))
    toBeEjected = []
    toBeEjectedServers = []
    # never eject the first server (the one we talk to); shuffle for randomness
    selection = self._servers[1:]
    shuffle(selection)
    for server in selection:
        for node in nodes:
            if server.ip == node.ip:
                toBeEjected.append(node.id)
                toBeEjectedServers.append(server)
                break
        if len(toBeEjected) == how_many:
            break
    if len(toBeEjected) > 0:
        self.log.info(
            "selected {0} for rebalance out from the cluster".format(
                toBeEjected))
        otpNodes = [node.id for node in nodes]
        started = rest.rebalance(otpNodes, toBeEjected)
        msg = "rebalance operation started ? {0}"
        self.log.info(msg.format(started))
        if started:
            result = rest.monitorRebalance()
            msg = "successfully rebalanced out selected nodes from the cluster ? {0}"
            self.log.info(msg.format(result))
            # restart membase on the ejected nodes; failures are deliberately
            # ignored because the node may already be stopped/started
            for server in toBeEjectedServers:
                shell = RemoteMachineShellConnection(server)
                try:
                    shell.stop_membase()
                except Exception:  # narrowed from bare except
                    pass
                try:
                    shell.start_membase()
                except Exception:  # narrowed from bare except
                    pass
                shell.disconnect()
                RestHelper(RestConnection(server)).is_ns_server_running()
            return result
    return True
def wait_for_ns_servers_or_assert(self, servers, wait_time=360, wait_if_warmup=False):
    """Return True once ns_server answers on every given server, False as
    soon as one server fails to respond within wait_time.

    wait_if_warmup is accepted for interface compatibility but unused here.
    """
    for node in servers:
        rest = RestConnection(node)
        self.log.debug("Waiting for ns_server @ {0}:{1}".format(
            node.ip, node.port))
        if not RestHelper(rest).is_ns_server_running(wait_time):
            self.log.error(
                "ns_server {0} is not running in {1} sec".format(
                    node.ip, wait_time))
            return False
        self.log.debug("ns_server @ {0}:{1} is running".format(
            node.ip, node.port))
    return True
def test_create_bucket_used_port(self):
    """Creating a bucket on a well-known reserved port must fail, and
    ns_server must stay up after each failed attempt."""
    ports = [25, 68, 80, 135, 139, 143, 500]
    for port in ports:
        try:
            self.cluster.create_standard_bucket(
                self.server, self.bucket_name + str(port), port,
                self.bucket_size, self.num_replicas)
        # BUG FIX: was a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit
        except Exception:
            self.log.info('Error appears as expected')
            rest = RestConnection(self.master)
            self.assertTrue(
                RestHelper(rest).is_ns_server_running(
                    timeout_in_seconds=60))
        else:
            # the creation unexpectedly succeeded
            raise Exception(
                'User has to be unable to create a bucket using port %s'
                % port)
def wipe_config_on_removed_nodes(self, remove_nodes):
    """ Stop servers on nodes that were failed over and removed, and wipe config dir """
    for node in remove_nodes:
        self.log.info("Wiping node config and restarting server on {0}".format(node))
        rest = RestConnection(node)
        config_dir = rest.get_data_path()
        shell = RemoteMachineShellConnection(node)
        shell.stop_couchbase()
        self.sleep(10)
        shell.cleanup_data_config(config_dir)
        shell.start_server()
        self.sleep(10)
        # log (but do not fail) if the node did not come back up
        if not RestHelper(rest).is_ns_server_running():
            self.log.error("ns_server {0} is not running.".format(node.ip))
        shell.disconnect()
def _create_default_bucket(self, replica=1):
    """Create the "default" bucket with the given replica count on the
    first server if it is missing, and assert it exists afterwards."""
    name = "default"
    master = self.servers[0]
    rest = RestConnection(master)
    helper = RestHelper(RestConnection(master))
    if not helper.bucket_exists(name):
        ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
        quota = rest.get_nodes_self().memoryQuota * ratio
        rest.create_bucket(bucket=name,
                           ramQuotaMB=int(quota),
                           replicaNumber=replica)
        warmed_up = BucketOperationHelper.wait_for_memcached(master, name)
        self.assertTrue(warmed_up, msg="wait_for_memcached failed")
    self.assertTrue(helper.bucket_exists(name),
                    msg="unable to create {0} bucket".format(name))
def test_online_swap_rebalance_upgrade(self):
    """ Online swap rebalance upgrade test

    The old nodes are removed and the new nodes are added followed by a
    rebalance, after which the backup service is exercised on the new nodes.
    """
    # Installs the `self.initial_version` of Couchbase on the first two servers
    self.product = 'couchbase-server'
    self._install(self.input.servers[:2])
    # Check Couchbase is running post installation
    for server in self.input.servers:
        self.assertTrue(
            RestHelper(RestConnection(server)).is_ns_server_running(60),
            f"ns_server is not running on {server}")
    # Install the `self.upgrade_versions` on the last 2 nodes
    self.initial_version = self.upgrade_versions[0]
    self._install(self.input.servers[2:])
    # Remove the first two nodes and perform a rebalance
    self.cluster.rebalance(self.servers, self.servers[2:], self.servers[:2],
                           services=["kv", "kv"])
    # Replace the services of the last node with kv and backup
    self.replace_services(self.servers[2:], self.servers[-1], ["kv,backup"])
    # Add the built in user for memcached authentication
    self.add_built_in_server_user(node=self.servers[2])
    # Create the default bucket and update the list of buckets
    rest_conn = RestConnection(self.servers[2])
    rest_conn.create_bucket(bucket='default', ramQuotaMB=512,
                            compressionMode=self.compression_mode)
    self.buckets = rest_conn.get_buckets()
    # Populate the buckets with data
    self._load_all_buckets(
        self.servers[2],
        BlobGenerator("ent-backup", "ent-backup-", self.value_size,
                      end=self.num_items),
        "create", 0)
    try:
        backup_service_hook = BackupServiceHook(self.servers[-1],
                                                self.servers,
                                                self.backupset, None)
        # Wait for the data to be persisted to disk
        for bucket in self.buckets:
            if not RebalanceHelper.wait_for_stats_on_all(
                    backup_service_hook.backup_service.master, bucket.name,
                    'ep_queue_size', 0, timeout_in_seconds=200):
                # BUG FIX: the message misspelled the stat as 'eq_queue_size'
                self.fail("Timeout reached while waiting for 'ep_queue_size' to reach 0")
        backup_service_hook.run_test()
    finally:
        backup_service_hook.cleanup()
def replication_verification(master, bucket_data, replica, test, failed_over=False):
    """Verify replication health after a rebalance/failover.

    For each bucket: run the expiry pager, wait for replica item counts to
    match, and (unless failed_over) compare curr_items against the number of
    keys the test inserted. Collected failures are reported at the end via
    test.assertTrue.
    """
    asserts = []
    rest = RestConnection(master)
    buckets = rest.get_buckets()
    nodes = rest.node_statuses()
    test.log.info("expect {0} / {1} replication ? {2}".format(
        len(nodes), (1.0 + replica), len(nodes) / (1.0 + replica)))
    # prime the expiry pager interval on every bucket
    for bucket in buckets:
        ClusterOperationHelper.flushctl_set(master, "exp_pager_stime", 30,
                                            bucket.name)
    # only meaningful when there are enough nodes to hold all replicas
    if len(nodes) / (1.0 + replica) >= 1:
        final_replication_state = RestHelper(rest).wait_for_replication(300)
        msg = "replication state after waiting for up to 5 minutes : {0}"
        test.log.info(msg.format(final_replication_state))
        #run expiry_pager on all nodes before doing the replication verification
        for bucket in buckets:
            ClusterOperationHelper.flushctl_set(master, "exp_pager_stime", 30,
                                                bucket.name)
            test.log.info("wait for expiry pager to run on all these nodes")
            time.sleep(30)
            # restore a long interval, then trigger one more pager run
            ClusterOperationHelper.flushctl_set(master, "exp_pager_stime", 3600,
                                                bucket.name)
            ClusterOperationHelper.flushctl_set(master, "exp_pager_stime", 30,
                                                bucket.name)
            # windows need more than 15 minutes to get number matched
            replica_match = RebalanceHelper.wait_till_total_numbers_match(
                bucket=bucket.name, master=master, timeout_in_seconds=600)
            if not replica_match:
                asserts.append("replication was completed but sum(curr_items) don't match the curr_items_total %s" % bucket.name)
            if not failed_over:
                # compare the active item count against what the test inserted
                stats = rest.get_bucket_stats(bucket=bucket.name)
                RebalanceHelper.print_taps_from_all_nodes(rest, bucket.name)
                msg = "curr_items : {0} is not equal to actual # of keys inserted : {1} : bucket: {2}"
                if bucket_data[bucket.name]['kv_store'] is None:
                    items_inserted = bucket_data[bucket.name]["items_inserted_count"]
                else:
                    items_inserted = len(bucket_data[bucket.name]['kv_store'].valid_items())
                active_items_match = stats["curr_items"] == items_inserted
                if not active_items_match:
                    asserts.append(msg.format(stats["curr_items"], items_inserted,
                                              bucket.name))
    # report every collected failure, then fail once if any occurred
    if len(asserts) > 0:
        for msg in asserts:
            test.log.error(msg)
    test.assertTrue(len(asserts) == 0, msg=asserts)
def cleanup_cluster(servers, wait_for_rebalance=True):
    """Rebalance every non-master node out of servers[0]'s cluster and verify
    each ejected node ends up with an empty "pools" list (fully reset).

    Raises Exception if any ejected node still reports cluster membership.
    """
    log = logger.Logger.get_logger()
    rest = RestConnection(servers[0])
    helper = RestHelper(rest)
    helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    if len(nodes) > 1:
        log.info("rebalancing all nodes in order to remove nodes")
        rest.log_client_error("Starting rebalance from test, ejected nodes %s" %
                              [node.id for node in nodes if node.id != master_id])
        helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in nodes if node.id != master_id],
            wait_for_rebalance=wait_for_rebalance)
        success_cleaned = []
        ejected = [node for node in nodes if node.id != master_id]
        for removed in ejected:
            # ejected nodes answer with the same credentials as the master
            removed.rest_password = servers[0].rest_password
            removed.rest_username = servers[0].rest_username
            try:
                rest = RestConnection(removed)
            except Exception as ex:
                log.error("can't create rest connection after rebalance out for "
                          "ejected nodes, will retry after 10 seconds according "
                          "to MB-8430: {0} ".format(ex))
                time.sleep(10)
                rest = RestConnection(removed)
            # a reset node reports zero pools; poll for up to 30 seconds
            start = time.time()
            while time.time() - start < 30:
                if len(rest.get_pools_info()["pools"]) == 0:
                    success_cleaned.append(removed)
                    break
                time.sleep(0.1)
            if time.time() - start > 10:
                log.error("'pools' on node {0}:{1} - {2}".format(
                    removed.ip, removed.port, rest.get_pools_info()["pools"]))
        not_cleaned = set(ejected) - set(success_cleaned)
        for node in not_cleaned:
            # BUG FIX: previously logged `removed` (the last loop variable)
            # instead of the node that actually failed to clean up
            log.error("node {0}:{1} was not cleaned after removing from cluster".format(
                node.ip, node.port))
        if len(not_cleaned) != 0:
            raise Exception("not all ejected nodes were cleaned successfully")
        log.info("removed all the nodes from cluster associated with {0} ? {1}".format(
            servers[0],
            [(node.id, node.port) for node in nodes if node.id != master_id]))
def test_stream_after_n_crashes(self):
    """Crash and restart a node `crash_num` times; after each restart open a
    DCP stream on one vbucket and verify it reaches the expected high seqno."""
    crashes = self.input.param("crash_num", 5)
    # BUG FIX: randint is inclusive on both ends, so the original could pick
    # vbucket id == self.vbuckets, which is out of range (valid: 0..n-1)
    vbucket = randint(0, self.vbuckets - 1)
    bucket = self.bucket_util.buckets[0]
    self.log.info("Chosen vbucket {0} for {1} crashes".format(
        vbucket, crashes))
    start = 0
    end = self.num_items
    nodeA = self.cluster.servers[0]
    shell_conn = RemoteMachineShellConnection(nodeA)
    cb_stat_obj = Cbstats(shell_conn)
    rest = RestHelper(RestConnection(nodeA))
    # BUG FIX: xrange is Python 2 only; this file uses Python 3 features
    for _ in range(crashes):
        # Load data into the selected vbucket
        self.load_docs(bucket, vbucket, start, end, "create")
        self.assertTrue(self.stop_node(0), msg="Failed during stop_node")
        self.sleep(5, "Sleep after stop_node")
        self.assertTrue(self.start_node(0), msg="Failed during start_node")
        self.assertTrue(rest.is_ns_server_running(),
                        msg="Failed while is_ns_server_running check")
        self.sleep(5, "Waiting after ns_server started")
        # Fetch vbucket seqno stats
        vb_stat = cb_stat_obj.vbucket_seqno(bucket.name)
        dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0,
                                       vb_stat[vbucket]["high_seqno"],
                                       vb_stat[vbucket]["uuid"])
        stream.run()
        self.assertTrue(
            stream.last_by_seqno == vb_stat[vbucket]["high_seqno"],
            msg="Mismatch in high_seqno. {0} == {1}".format(
                vb_stat[vbucket]["high_seqno"], stream.last_by_seqno))
        # Update start/end values for next loop
        start = end
        end += self.num_items
    # Disconnect shell Connection for the node
    shell_conn.disconnect()
def verify_upgrade_rebalance_in_out(self):
    """After a rebalance-in/out upgrade, verify every bucket survived and,
    depending on op_types, check bucket info or the loaded data."""
    self.master = self.servers[self.initial_num_servers]
    self.rest = RestConnection(self.master)
    self.rest_helper = RestHelper(self.rest)
    for bucket in self.buckets:
        if not self.rest_helper.bucket_exists(bucket.name):
            raise Exception("bucket:- %s not found" % bucket.name)
    if self.op_types == "bucket":
        # NOTE: inspects the last bucket from the loop above (original behavior)
        bucketinfo = self.rest.get_bucket(bucket.name)
        self.log.info("bucket info :- %s" % bucketinfo)
    if self.op_types == "data":
        upgraded_servers = self.servers[self.initial_num_servers:self.num_servers]
        self._wait_for_stats_all_buckets(upgraded_servers)
        self._verify_all_buckets(self.master, 1, self.wait_timeout * 50,
                                 self.max_verify, True, 1)
        self._verify_stats_all_buckets(upgraded_servers)
def _modify_bucket(self):
    """Attempt to enable time synchronization on an existing bucket and
    expect the server to reject the change.

    Fails the test if the server accepted the modification.
    """
    # NOTE: removed unused `helper`, `node_ram_ratio` and `info` locals that
    # were computed but never read.
    status, content = self.rest.change_bucket_props(
        bucket=self.bucket, ramQuotaMB=512, authType='sasl',
        timeSynchronization='enabledWithOutDrift')
    # the server's (misspelled) rejection message is matched verbatim
    if re.search('TimeSyncronization not allowed in update bucket', content):
        self.log.info(
            '[PASS]Expected modify bucket to disallow Time Synchronization.'
        )
    else:
        self.fail(
            '[ERROR] Not expected to allow modify bucket for Time Synchronization'
        )
def cleanup_cluster(servers, wait_for_rebalance=True):
    """Rebalance every node except the master out of servers[0]'s cluster."""
    log = logger.Logger.get_logger()
    rest = RestConnection(servers[0])
    helper = RestHelper(rest)
    helper.is_ns_server_running(
        timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
    nodes = rest.node_statuses()
    master_id = rest.get_nodes_self().id
    if len(nodes) > 1:
        log.info("rebalancing all nodes in order to remove nodes")
        known = [node.id for node in nodes]
        ejected = [node.id for node in nodes if node.id != master_id]
        removed = helper.remove_nodes(knownNodes=known,
                                      ejectedNodes=ejected,
                                      wait_for_rebalance=wait_for_rebalance)
        log.info(
            "removed all the nodes from cluster associated with {0} ? {1}".
            format(servers[0], removed))
def test_win_specific_names(self):
    """Run the banned-bucket-name test on Windows only; if ns_server is
    down afterwards, reinstall the original version and fail."""
    version = self._get_cb_version()
    if self._get_cb_os() != 'windows':
        self.log.warn('This test is windows specific')
        return
    try:
        self.test_banned_bucket_name()
    finally:
        try:
            self.log.info('Will check if ns_server is running')
            rest = RestConnection(self.master)
            self.assertTrue(
                RestHelper(rest).is_ns_server_running(
                    timeout_in_seconds=60))
        # BUG FIX: was a bare `except:`; Exception still covers the
        # AssertionError raised by assertTrue and any connection errors
        except Exception:
            # the node is wedged: reinstall before failing
            self._reinstall(version)
            self.fail(
                "ns_server is not running after bucket '%s' creation"
                % (self.bucket_name))
def remove_node(self, otpnode=None, wait_for_rebalance=True):
    """Rebalance the given otp nodes out of the cluster.

    otpnode: list of otp node objects to eject (despite the singular name).
    The first rebalance-out is retried once after 5s, a known product bug.
    Returns the rebalance outcome when wait_for_rebalance is True.
    """
    nodes = self.rest.node_statuses()
    '''This is the case when master node is running cbas service as well'''
    if len(nodes) <= len(otpnode):
        return
    helper = RestHelper(self.rest)
    try:
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in otpnode],
            wait_for_rebalance=wait_for_rebalance)
    except Exception as e:
        self.log.info("First time rebalance failed on Removal. Wait and try again. THIS IS A BUG.")
        # BUG FIX: the caught exception was silently discarded; log it
        self.log.info(e)
        time.sleep(5)
        removed = helper.remove_nodes(
            knownNodes=[node.id for node in nodes],
            ejectedNodes=[node.id for node in otpnode],
            wait_for_rebalance=wait_for_rebalance)
    # BUG FIX: the original ended with a bare `removed` expression (a no-op);
    # return the outcome so callers can check it.
    if wait_for_rebalance:
        return removed
def test_crash_entire_cluster(self):
    """Stop every node, restart them in reverse order, then verify a DCP
    stream on one vbucket reaches the post-restart high seqno."""
    self.cluster.rebalance([self.master], self.servers[1:], [])
    bucket = self.bucket_util.buckets[0]
    # BUG FIX: randint is inclusive, valid vbucket ids are 0..vbuckets-1
    vbucket = randint(0, self.vbuckets - 1)
    nodeA = self.servers[0]
    self.load_docs(bucket, vbucket, 0, self.num_items, "create")
    shell_conn = RemoteMachineShellConnection(nodeA)
    cb_stat_obj = Cbstats(shell_conn)
    dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
    _ = dcp_client.stream_req(vbucket, 0, 0, 2 * self.num_items, 0)
    # BUG FIX: was self.load_docs(nodeA, vbucket, self.num_items), which does
    # not match the load_docs(bucket, vbucket, start, end, op) signature used
    # elsewhere; load the second batch of num_items docs.
    self.load_docs(bucket, vbucket, self.num_items, 2 * self.num_items,
                   "create")
    # stop all nodes
    # BUG FIX: range objects have no .reverse() on Python 3
    node_range = list(range(len(self.servers)))
    for i in node_range:
        self.assertTrue(self.stop_node(i), msg="Failed during stop_node")
        self.sleep(2, "Wait after stop_node")
    # start all nodes in reverse order
    node_range.reverse()
    for i in node_range:
        self.assertTrue(self.start_node(i), msg="Failed during start_node")
    rest = RestHelper(RestConnection(nodeA))
    self.assertTrue(rest.is_ns_server_running(),
                    msg="Failed while is_ns_server_running check")
    vb_info = cb_stat_obj.vbucket_seqno(bucket.name)
    dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
    stream = dcp_client.stream_req(vbucket, 0, 0,
                                   vb_info[vbucket]["high_seqno"], 0)
    stream.run()
    self.assertTrue(stream.last_by_seqno == vb_info[vbucket]["high_seqno"],
                    msg="Seq-no mismatch. {0} != {1}".format(
                        stream.last_by_seqno,
                        vb_info[vbucket]["high_seqno"]))
    # Disconnect shell Connection for the node
    shell_conn.disconnect()
def test_failover_transaction(self):
    """Begin a transaction, launch query threads against a node, then fail
    over or remove that node and check the queries still complete."""
    query_node = self.servers[1]
    sleep_time_ms = 10000
    threads = [None] * self.thread_count
    results = [None] * self.thread_count
    # Start a transaction
    begin_work = self.run_cbq_query(query="BEGIN WORK", server=query_node,
                                    txtimeout="2m")
    txid = begin_work['results'][0]['txid']
    # Launch query thread/s (should be single)
    select_statement = f"select {sleep_time_ms}"
    for idx in range(len(threads)):
        self.log.info(f"Lauching query thread {idx}")
        worker = threading.Thread(
            target=self.run_query,
            args=(select_statement, query_node, results, idx, 60, txid))
        threads[idx] = worker
        worker.start()
    # Perform failover or removal of query node
    self.sleep(2)
    if self.action == 'failover':
        failover = self.cluster.failover(servers=self.servers,
                                         failover_nodes=[query_node],
                                         graceful=self.graceful)
    elif self.action == 'remove':
        rebalance = self.cluster.async_rebalance(servers=self.servers,
                                                 to_add=[],
                                                 to_remove=[query_node])
        reached = RestHelper(self.rest).rebalance_reached()
        self.assertTrue(reached,
                        "rebalance failed, stuck or did not complete")
        rebalance.result()
    # Check query thread/s completed successfuly
    for worker in threads:
        worker.join()
    self.log.info(results)
    for outcome in results:
        self.assertEqual(outcome, [{'$1': sleep_time_ms}])
def _create_bucket(self, lww=True, drift=False, name=None):
    """Create self.bucket (optionally under a new name) with the requested
    last-write-wins setting, if it does not already exist.

    drift is accepted for interface compatibility but unused here.
    """
    if lww:
        self.lww = lww
    if name:
        self.bucket = name
    helper = RestHelper(self.rest)
    if not helper.bucket_exists(self.bucket):
        # NOTE: removed unused `node_ram_ratio`/`info` locals; the quota is
        # fixed at 512 MB regardless.
        self.rest.create_bucket(bucket=self.bucket, ramQuotaMB=512,
                                authType='sasl', lww=self.lww)
        try:
            BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
        except Exception as e:
            # BUG FIX: the original discarded the exception; include it so
            # the failure is diagnosable
            self.fail('unable to create bucket: {0}'.format(e))
def test_rebalance_in_query_node(self):
    """Prepare a statement, rebalance a new n1ql/index/data node in, then
    confirm the prepared statement executes from every node."""
    self.run_cbq_query(
        query="PREPARE p1 from select * from default limit 5",
        server=self.servers[0])
    self.sleep(5)
    for idx in range(self.nodes_init):
        self.run_cbq_query(query="execute p1", server=self.servers[idx])
    new_services = ["n1ql", "index", "data"]
    rebalance_task = self.cluster.async_rebalance(
        self.servers[:self.nodes_init],
        [self.servers[self.nodes_init + 1]], [],
        services=new_services)
    self.assertTrue(RestHelper(self.rest).rebalance_reached(),
                    "rebalance failed, stuck or did not complete")
    rebalance_task.result()
    self.sleep(30)
    # execute via the fully qualified prepared-statement name
    qualified = "execute '[%s:%s]p1'" % (self.servers[0].ip,
                                         self.servers[0].port)
    for idx in range(self.nodes_init + 2):
        self.run_cbq_query(query=qualified, server=self.servers[idx])
def add_node_and_rebalance(self, master, servers):
    """Add all servers to master's cluster, rebalance, and verify the
    cluster ends up healthy and balanced."""
    ClusterOperationHelper.add_all_nodes_or_assert(
        master, servers, self.input.membase_settings, self)
    rest = RestConnection(master)
    nodes = rest.node_statuses()
    otpNodeIds = [node.id for node in nodes]
    rebalanceStarted = rest.rebalance(otpNodeIds, [])
    self.assertTrue(rebalanceStarted,
                    "unable to start rebalance on master node {0}".format(master.ip))
    self.log.info('started rebalance operation on master node {0}'.format(master.ip))
    rebalanceSucceeded = rest.monitorRebalance()
    self.assertTrue(rebalanceSucceeded,
                    "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))
    self.log.info('rebalance operaton succeeded for nodes: {0}'.format(otpNodeIds))
    #now remove the nodes
    #make sure its rebalanced and node statuses are healthy
    helper = RestHelper(rest)
    # BUG FIX: these methods were asserted without being called; a bound
    # method is always truthy, so the checks could never fail.
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
def failover(self, howmany):
    """Fail over `howmany` randomly chosen non-master nodes.

    Stops the selected servers, waits for them to be reported unhealthy,
    then fails them over via REST. Returns the list of failed-over nodes.
    """
    #chekck if all nodes are part of the cluster
    rest = RestConnection(self.servers[0])
    nodes = rest.node_statuses()
    if len(nodes) != len(self.servers):
        self.test.fail(
            num_nodes_mismatch.format(len(self.servers), len(nodes)))
    # keep at least 2 nodes in the cluster after failover
    if len(nodes) - howmany < 2:
        self.test.fail(num_nodes_mismatch.format(len(nodes), howmany))
    master_node = rest.get_nodes_self()
    #when selecting make sure we dont pick the master node
    selection = [n for n in nodes if n.id != master_node.id]
    shuffle(selection)
    failed = selection[0:howmany]
    for f in failed:
        self.log.info("will fail over node : {0}".format(f.id))
    # only proceed when the cluster is large enough relative to the
    # number of nodes being failed
    if len(nodes) // (1 + howmany) >= 1:
        # make sure replication is caught up before taking nodes down
        self.test.assertTrue(
            RebalanceHelper.wait_for_replication(rest.get_nodes(),
                                                 timeout=900),
            msg="replication did not finish after 15 minutes")
        for f in failed:
            self._stop_server(f)
        self.log.info(
            "10 seconds delay to wait for membase-server to shutdown")
        #wait for 5 minutes until node is down
        for f in failed:
            if f.port == 8091:
                self.test.assertTrue(
                    RestHelper(rest).wait_for_node_status(
                        f, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
                self.test.assertTrue(rest.fail_over(f.id),
                                     msg="failover did not complete")
                self.log.info("failed over node : {0}".format(f.id))
    return failed
def _common_test_body(self):
    """Add all servers, rebalance them in, then repeatedly fail over one
    node at a time: start a rebalance-out, stop it mid-way, verify
    replication, and finally complete the rebalance.
    """
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    bucket_data = RebalanceBaseTest.bucket_data_init(rest)
    ClusterHelper.add_all_nodes_or_assert(master, self.servers, creds, self)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding nodes")
    nodes = rest.node_statuses()
    #dont rebalance out the current node
    while len(nodes) > 1:
        #pick a node that is not the master node
        toBeEjectedNode = RebalanceHelper.pick_node(master)
        distribution = RebalanceBaseTest.get_distribution(self.load_ratio)
        RebalanceBaseTest.load_data_for_buckets(rest, self.load_ratio,
                                                distribution, [master],
                                                bucket_data, self)
        self.log.info("current nodes : {0}".format(
            [node.id for node in rest.node_statuses()]))
        #let's start/step rebalance three times
        self.log.info("removing node {0} and rebalance afterwards".format(
            toBeEjectedNode.id))
        rest.fail_over(toBeEjectedNode.id)
        self.log.info("failed over {0}".format(toBeEjectedNode.id))
        time.sleep(10)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[toBeEjectedNode.id])
        # interrupt the rebalance once it passes 30% to exercise stop/restart
        expected_progress = 30
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        self.assertTrue(reached,
                        "rebalance failed or did not reach {0}%".format(
                            expected_progress))
        stopped = rest.stop_rebalance()
        self.assertTrue(stopped, msg="unable to stop rebalance")
        time.sleep(20)
        RebalanceBaseTest.replication_verification(master, bucket_data,
                                                   self.replica, self)
        # now finish the rebalance for real
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[toBeEjectedNode.id])
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}".format(
                            toBeEjectedNode.id))
        time.sleep(20)
        RebalanceBaseTest.replication_verification(master, bucket_data,
                                                   self.replica, self)
        # refresh the node list for the loop condition
        nodes = rest.node_statuses()
def common_setup(self, replica):
    """Reset the cluster, create the default bucket with the given replica
    count, rebalance all nodes in, and build a vbucket-aware client."""
    self._input = TestInputSingleton.input
    self._servers = self._input.servers
    first = self._servers[0]
    self.log = logger.Logger().get_logger()
    self.log.info(self._input)
    rest = RestConnection(first)
    for server in self._servers:
        RestHelper(RestConnection(server)).is_ns_server_running()
    ClusterOperationHelper.cleanup_cluster(self._servers)
    BucketOperationHelper.delete_all_buckets_or_assert(self._servers, self)
    ClusterOperationHelper.add_all_nodes_or_assert(
        self._servers[0], self._servers, self._input.membase_settings, self)
    nodes = rest.node_statuses()
    otpNodeIds = [node.id for node in nodes]
    info = rest.get_nodes_self()
    bucket_ram = info.mcdMemoryReserved * 3 / 4
    rest.create_bucket(bucket="default",
                       ramQuotaMB=int(bucket_ram),
                       replicaNumber=replica,
                       proxyPort=rest.get_nodes_self().moxi)
    msg = "wait_for_memcached fails"
    # BUG FIX: the original had a trailing comma here, which made `ready` a
    # one-element tuple — always truthy, so the assert below never failed.
    ready = BucketOperationHelper.wait_for_memcached(first, "default")
    self.assertTrue(ready, msg)
    rebalanceStarted = rest.rebalance(otpNodeIds, [])
    self.assertTrue(
        rebalanceStarted,
        "unable to start rebalance on master node {0}".format(first.ip))
    self.log.info('started rebalance operation on master node {0}'.format(
        first.ip))
    rebalanceSucceeded = rest.monitorRebalance()
    # without a bucket this seems to fail
    self.assertTrue(
        rebalanceSucceeded,
        "rebalance operation for nodes: {0} was not successful".format(
            otpNodeIds))
    self.awareness = VBucketAwareMemcached(rest, "default")
def load_some_data(serverInfo, fill_ram_percentage=10.0, bucket_name='default'):
    """Insert ~fill_ram_percentage of the bucket's free RAM worth of keys
    into `bucket_name` and return the list of keys actually inserted.

    Loading stops at the first memcached error.
    """
    log = logger.Logger.get_logger()
    if fill_ram_percentage <= 0.0:
        fill_ram_percentage = 5.0
    client = MemcachedClientHelper.direct_client(serverInfo, bucket_name)
    #populate key
    rest = RestConnection(serverInfo)
    RestHelper(rest).vbucket_map_ready(bucket_name, 60)
    vbucket_count = len(rest.get_vbuckets(bucket_name))
    testuuid = uuid.uuid4()
    info = rest.get_bucket(bucket_name)
    emptySpace = info.stats.ram - info.stats.memUsed
    log.info('emptySpace : {0} fill_ram_percentage : {1}'.format(
        emptySpace, fill_ram_percentage))
    fill_space = (emptySpace * fill_ram_percentage) / 100.0
    log.info("fill_space {0}".format(fill_space))
    # each packet can be 10 KB
    packetSize = int(10 * 1024)
    # BUG FIX: '/' yields a float on Python 3 and range() below would raise
    # TypeError; use floor division.
    number_of_buckets = int(fill_space) // packetSize
    log.info('packetSize: {0}'.format(packetSize))
    log.info('memory usage before key insertion : {0}'.format(
        info.stats.memUsed))
    # BUG FIX: the second placeholder was {0}, repeating the key count
    # instead of printing the server ip.
    log.info('inserting {0} new keys to memcached @ {1}'.format(
        number_of_buckets, serverInfo.ip))
    keys = ["key_%s_%d" % (testuuid, i) for i in range(number_of_buckets)]
    inserted_keys = []
    for key in keys:
        vbucketId = crc32.crc32_hash(key) & (vbucket_count - 1)
        client.vbucketId = vbucketId
        try:
            client.set(key, 0, 0, key)
            inserted_keys.append(key)
        except mc_bin_client.MemcachedError as error:
            log.error(error)
            log.error("unable to push key : {0} to vbucket : {1}".format(
                key, client.vbucketId))
            # BUG FIX: the original referenced an undefined name `test` here
            # (NameError on this path); just stop loading on the first error.
            break
    client.close()
    return inserted_keys
def test_node_memcached_failure(self):
    """Pause couchbase on one node and verify the cluster auto-reprovisions,
    ends up healthy and balanced, and keeps its data intact."""
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    self._pause_couchbase(self.server_fail)
    self.sleep(5)
    grace = timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME
    AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                      grace, self)
    RemoteUtilHelper.common_basic_setup([self.server_fail])
    AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                        grace, self)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertTrue(helper.is_cluster_rebalanced(),
                    "cluster is not balanced")
    for bucket in self.rest.get_buckets():
        self.verify_loaded_data(self.master, bucket.name,
                                self.loaded_items[bucket.name])
def test_setting_propogation_rebalance_in(self):
    """Set the query tmp-space size and directory, rebalance a new node in,
    and verify both settings propagated to the newly added node."""
    expected_curl = self.set_tmpspace()
    self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
    expected_dir = self.set_directory()
    self.assertEqual(expected_dir['queryTmpSpaceDir'], self.directory_path)
    rebalance = self.cluster.async_rebalance(
        self.servers[:self.nodes_init],
        [self.servers[self.nodes_init]], [],
        services=["n1ql", "index", "data"])
    self.assertTrue(RestHelper(self.rest).rebalance_reached(),
                    "rebalance failed, stuck or did not complete")
    self.sleep(1)
    # query the settings endpoint directly on the newly added node
    new_node = self.servers[self.nodes_init]
    curl_url = "http://%s:%s/settings/querySettings" % (new_node.ip,
                                                        new_node.port)
    curl_output = self.shell.execute_command(
        "%s -u Administrator:password %s" % (self.curl_path, curl_url))
    expected_curl = self.convert_list_to_json(curl_output[0])
    self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
    self.assertEqual(expected_curl['queryTmpSpaceDir'], self.directory_path)
def rebalance_in_with_cluster_password_change(self):
    """Change the cluster password while a rebalance-in is running and
    expect the rebalance to fail; always restore the old password."""
    new_password = self.input.param("new_password", "new_pass")
    servs_result = self.servers[:self.nodes_init + self.nodes_in]
    rebalance = self.cluster.async_rebalance(
        self.servers[:self.nodes_init],
        self.servers[self.nodes_init:self.nodes_init + self.nodes_in],
        [])
    old_pass = self.master.rest_password
    self.sleep(10, "Wait for rebalance have some progress")
    self.change_password(new_password=new_password)
    try:
        rebalance.result()
        # rebalance unexpectedly succeeded
        self.log.exception("rebalance should be failed when password is changing")
        self.verify_unacked_bytes_all_buckets()
    except Exception as ex:
        # expected path: the credential change broke the rebalance
        self.sleep(10, "wait for rebalance failed")
        rest = RestConnection(self.master)
        self.log.info("Latest logs from UI:")
        for entry in rest.get_logs():
            self.log.error(entry)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced())
    finally:
        self.change_password(new_password=old_pass)