def common_tearDown(servers, testcase):
    log = logger.Logger.get_logger()
    log.info("============== common_tearDown was started for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))
    RemoteUtilHelper.common_basic_setup(servers)
    log.info("10 seconds delay to wait for couchbase-server to start")
    time.sleep(10)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(
        servers, testcase,
        wait_time=AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME * 15,
        wait_if_warmup=True)
    try:
        # this is a module-level helper, so use the servers argument
        rest = RestConnection(servers[0])
        buckets = rest.get_buckets()
        for bucket in buckets:
            MemcachedClientHelper.flush_bucket(servers[0], bucket.name)
    except Exception:
        pass
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    log.info("============== common_tearDown was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def tearDown(self):
    try:
        self._cluster_helper.shutdown()
        log = logger.Logger.get_logger()
        log.info("============== tearDown was started for test #{0} {1} =============="
                 .format(self.case_number, self._testMethodName))
        RemoteUtilHelper.common_basic_setup(self._servers)
        log.info("10 seconds delay to wait for membase-server to start")
        time.sleep(10)
        for server in self._cleanup_nodes:
            shell = RemoteMachineShellConnection(server)
            o, r = shell.execute_command("iptables -F")
            shell.log_command_output(o, r)
            o, r = shell.execute_command("/sbin/iptables -A INPUT -p tcp -i eth0 --dport 1000:60000 -j ACCEPT")
            shell.log_command_output(o, r)
            o, r = shell.execute_command("/sbin/iptables -A OUTPUT -p tcp -o eth0 --dport 1000:60000 -j ACCEPT")
            shell.log_command_output(o, r)
            o, r = shell.execute_command("/etc/init.d/couchbase-server start")
            shell.log_command_output(o, r)
            shell.disconnect()
        BucketOperationHelper.delete_all_buckets_or_assert(self._servers, self)
        ClusterOperationHelper.cleanup_cluster(self._servers)
        ClusterHelper.wait_for_ns_servers_or_assert(self._servers, self)
        log.info("============== tearDown was finished for test #{0} {1} =============="
                 .format(self.case_number, self._testMethodName))
    finally:
        pass

def common_setup(input, testcase):
    servers = input.servers
    RemoteUtilHelper.common_basic_setup(servers)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    for server in servers:
        ClusterOperationHelper.cleanup_cluster([server])
    ClusterHelper.wait_for_ns_servers_or_assert(servers, testcase)

def tearDown(self):
    if hasattr(self, '_resultForDoCleanups') and len(self._resultForDoCleanups.failures) > 0 \
            and 'stop-on-failure' in TestInputSingleton.input.test_params \
            and str(TestInputSingleton.input.test_params['stop-on-failure']).lower() == 'true':
        # supported starting with python2.7
        log.warn("CLEANUP WAS SKIPPED")
        self.cluster.shutdown(force=True)
        self._log_finish(self)
    else:
        try:
            self.log.info("============== tearDown was started for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
            RemoteUtilHelper.common_basic_setup(self.servers)
            BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self)
            for node in self.servers:
                master = node
                try:
                    ClusterOperationHelper.cleanup_cluster(self.servers, master=master)
                except:
                    continue
            self.log.info("============== tearDown was finished for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
        finally:
            super(FailoverBaseTest, self).tearDown()

def run_failover_operations(self, chosen, failover_reason):
    """ Method to run failover operations used in the test scenario based on failover reason """
    # Perform operations related to failover
    for node in chosen:
        if failover_reason == 'stop_server':
            self.stop_server(node)
            self.log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait for 5 minutes until node is down
            self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            server = [srv for srv in self.servers if node.ip == srv.ip][0]
            RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
            status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
            if status:
                self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something went wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        info = shell.extract_remote_info()
                        if info.type.lower() == "windows":
                            o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            shell.log_command_output(o, r)
                        else:
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                        shell.disconnect()
                self.rest.print_UI_logs()
                api = self.rest.baseUrl + 'nodeStatuses'
                status, content, header = self.rest._http_request(api)
                json_parsed = json.loads(content)
                self.log.info("nodeStatuses: {0}".format(json_parsed))
                self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # define precondition check for failover
        failed_over = self.rest.fail_over(node.id, graceful=self.graceful)

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful failover was started for unhealthy node!!!")
            return
        elif self.gracefulFailoverFail and failed_over:
            """ Check if the fail_over fails as expected """
            self.assertTrue(not failed_over, """ Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for graceful failover, stuck or did not complete")

def test_node_firewall_enabled(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    RemoteUtilHelper.enable_firewall(self.server_fail)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(5)
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.disable_firewall()
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

def test_upgrade_negative(self):
    op = self.input.param("op", None)
    error = self.input.param("error", '')
    remote = RemoteMachineShellConnection(self.master)
    if op is None:
        self.fail("operation should be specified")
    if op == "higher_version":
        tmp = self.initial_version
        self.initial_version = self.upgrade_versions[0]
        self.upgrade_versions = [tmp, ]
    info = None
    if op == "wrong_arch":
        info = remote.extract_remote_info()
        info.architecture_type = ('x86_64', 'x86')[info.architecture_type == 'x86']
    self._install([self.master])
    self.operations([self.master])
    try:
        if op == "close_port":
            RemoteUtilHelper.enable_firewall(self.master)
        for upgrade_version in self.upgrade_versions:
            self.sleep(self.sleep_time,
                       "Pre-setup of old version is done. Wait for upgrade to {0} version"
                       .format(upgrade_version))
            output, error = self._upgrade(upgrade_version, self.master, info=info)
            if str(output).find(error) != -1 or str(error).find(error) != -1:
                raise Exception(error)
    except Exception, ex:
        self.log.info("Exception %s appeared as expected" % ex)
        self.log.info("Check that old version is working fine")
        self.verification([self.master])

def common_setup(input, testcase):
    log.info("============== common_setup was started for test #{0} {1}=============="
             .format(testcase.case_number, testcase._testMethodName))
    servers = input.servers
    RemoteUtilHelper.common_basic_setup(servers)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)

    # Add built-in user
    testuser = [{'id': 'cbadminbucket',
                 'name': 'cbadminbucket',
                 'password': '******'}]
    RbacBase().create_user_source(testuser, 'builtin', servers[0])

    # Assign user to role
    role_list = [{'id': 'cbadminbucket',
                  'name': 'cbadminbucket',
                  'roles': 'admin'}]
    RbacBase().add_user_role(role_list, RestConnection(servers[0]), 'builtin')

    log.info("============== common_setup was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def tearDown(self):
    if hasattr(self, '_resultForDoCleanups') and len(self._resultForDoCleanups.failures) > 0 \
            and 'stop-on-failure' in TestInputSingleton.input.test_params \
            and str(TestInputSingleton.input.test_params['stop-on-failure']).lower() == 'true':
        # supported starting with python2.7
        log.warn("CLEANUP WAS SKIPPED")
        self.cluster.shutdown()
        self._log_finish(self)
    else:
        try:
            self.log.info("============== tearDown was started for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
            RemoteUtilHelper.common_basic_setup(self.servers)
            self.log.info("10 seconds delay to wait for membase-server to start")
            time.sleep(10)
            for server in self._cleanup_nodes:
                shell = RemoteMachineShellConnection(server)
                o, r = shell.execute_command("iptables -F")
                shell.log_command_output(o, r)
                o, r = shell.execute_command("/sbin/iptables -A INPUT -p tcp -i eth0 --dport 1000:60000 -j ACCEPT")
                shell.log_command_output(o, r)
                o, r = shell.execute_command("/sbin/iptables -A OUTPUT -p tcp -o eth0 --dport 1000:60000 -j ACCEPT")
                shell.log_command_output(o, r)
                o, r = shell.execute_command("/etc/init.d/couchbase-server start")
                shell.log_command_output(o, r)
                shell.disconnect()
            self.log.info("============== tearDown was finished for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
        finally:
            super(FailoverBaseTest, self).tearDown()

def tearDown(self):
    if hasattr(self, '_resultForDoCleanups') and len(self._resultForDoCleanups.failures) > 0 \
            and 'stop-on-failure' in TestInputSingleton.input.test_params \
            and str(TestInputSingleton.input.test_params['stop-on-failure']).lower() == 'true':
        # supported starting with python2.7
        log.warn("CLEANUP WAS SKIPPED")
        self.cluster.shutdown(force=True)
        self._log_finish(self)
    else:
        try:
            self.log.info("============== tearDown was started for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
            RemoteUtilHelper.common_basic_setup(self.servers)
            BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self)
            for node in self.servers:
                master = node
                try:
                    ClusterOperationHelper.cleanup_cluster(self.servers, master=master)
                except:
                    continue
            self.log.info("============== tearDown was finished for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
        finally:
            super(FailoverBaseTest, self).tearDown()

def tearDown(self):
    try:
        self._cluster_helper.shutdown()
        log = logger.Logger.get_logger()
        log.info("============== tearDown was started for test #{0} {1} =============="
                 .format(self.case_number, self._testMethodName))
        RemoteUtilHelper.common_basic_setup(self._servers)
        log.info("10 seconds delay to wait for membase-server to start")
        time.sleep(10)
        for server in self._cleanup_nodes:
            shell = RemoteMachineShellConnection(server)
            o, r = shell.execute_command("iptables -F")
            shell.log_command_output(o, r)
            o, r = shell.execute_command(
                "/sbin/iptables -A INPUT -p tcp -i eth0 --dport 1000:60000 -j ACCEPT")
            shell.log_command_output(o, r)
            o, r = shell.execute_command(
                "/sbin/iptables -A OUTPUT -p tcp -o eth0 --dport 1000:60000 -j ACCEPT")
            shell.log_command_output(o, r)
            o, r = shell.execute_command(
                "/etc/init.d/couchbase-server start")
            shell.log_command_output(o, r)
            shell.disconnect()
        BucketOperationHelper.delete_all_buckets_or_assert(self._servers, self)
        ClusterOperationHelper.cleanup_cluster(self._servers)
        ClusterHelper.wait_for_ns_servers_or_assert(self._servers, self)
        log.info("============== tearDown was finished for test #{0} {1} =============="
                 .format(self.case_number, self._testMethodName))
    finally:
        pass

def test_topology_change_events(self): available_server_before_rebalance = copy.deepcopy(self.available_servers) try: self.log.info("Enabling firewall between Incoming node and CBAS CC " "node to trigger topology_change_failed event") for node in available_server_before_rebalance: RemoteUtilHelper.enable_firewall( node, bidirectional=False, xdcr=False, action_on_packet="REJECT", block_ips=[self.cluster.cbas_cc_node.ip], all_interface=True) self.log.info("Rebalancing IN CBAS node to trigger " "topology_change_started event") rebalance_task, self.available_servers = self.rebalance_util.rebalance( self.cluster, kv_nodes_in=0, kv_nodes_out=0, cbas_nodes_in=1, cbas_nodes_out=0, available_servers=self.available_servers, exclude_nodes=[]) if self.rebalance_util.wait_for_rebalance_task_to_complete( rebalance_task, self.cluster, check_cbas_running=False): raise Exception("Rebalance passed when it should have failed.") self.log.info("Disabling firewall between Incoming node and CBAS CC " "node and retriggering rebalance to trigger " "topology_change_completed event") for node in available_server_before_rebalance: remote_client = RemoteMachineShellConnection(node) remote_client.disable_firewall() remote_client.disconnect() rebalance_task, self.available_servers = self.rebalance_util.rebalance( self.cluster, kv_nodes_in=0, kv_nodes_out=0, cbas_nodes_in=0, cbas_nodes_out=0, available_servers=self.available_servers, exclude_nodes=[]) if not self.rebalance_util.wait_for_rebalance_task_to_complete( rebalance_task, self.cluster, check_cbas_running=False): raise Exception("Rebalance failed even after disabling " "firewall") self.log.info("Adding event for topology_change_started event") self.system_events.add_event(AnalyticsEvents.topology_change_started( self.cluster.cbas_cc_node.ip, 2, 0)) self.log.info("Adding event for topology_change_failed event") self.system_events.add_event(AnalyticsEvents.topology_change_failed( self.cluster.cbas_cc_node.ip, 2, 0)) self.log.info("Adding event for topology_change_completed event") self.system_events.add_event(AnalyticsEvents.topology_change_completed( self.cluster.cbas_cc_node.ip, 2, 0)) except Exception as err: self.log.info("Disabling Firewall") for node in available_server_before_rebalance: remote_client = RemoteMachineShellConnection(node) remote_client.disable_firewall() remote_client.disconnect() self.fail(str(err))
def run_failover_operations_with_ops(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable = True self.stop_server(node) self.log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": unreachable = True self.filter_list.append (node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300) if status: self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") nodes = self.filter_servers(self.servers, chosen) failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful) # Perform Compaction compact_tasks = [] if self.compact: for bucket in self.buckets: compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket)) # Run View Operations if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run mutation operations if self.withMutationOps: self.run_mutation_operations() failed_over.result() for task in compact_tasks: task.result() msg = "rebalance failed while removing failover nodes {0}".format(node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
def run_failover_operations_with_ops(self, chosen, failover_reason): """ Method to run fail over operations used in the test scenario based on failover reason """ # Perform Operations relalted to failover failed_over = True for node in chosen: unreachable = False if failover_reason == 'stop_server': unreachable=True self.stop_server(node) self.log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": unreachable=True self.filter_list.append (node.ip) server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300) if status: self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") shell.log_command_output(o, r) else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.rest.print_UI_logs() api = self.rest.baseUrl + 'nodeStatuses' status, content, header = self.rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") nodes = self.filter_servers(self.servers,chosen) failed_over = self.cluster.async_failover([self.master], failover_nodes = chosen, graceful=self.graceful) # Perform Compaction compact_tasks = [] if self.compact: for bucket in self.buckets: compact_tasks.append(self.cluster.async_compact_bucket(self.master,bucket)) # Run View Operations if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run mutation operations if self.withMutationOps: self.run_mutation_operations() failed_over.result() for task in compact_tasks: task.result() msg = "rebalance failed while removing failover nodes {0}".format(node.id) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
def test_60s_timeout_firewall(self):
    timeout = self.timeout
    server_fail = self._servers[1]
    status = self.rest.update_autofailover_settings(True, timeout)
    if not status:
        self.fail('failed to change autofailover_settings! See MB-7282')
    self.sleep(5)
    RemoteUtilHelper.enable_firewall(server_fail)
    AutoFailoverBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

def common_tearDown(servers, testcase):
    RemoteUtilHelper.common_basic_setup(servers)
    log = logger.Logger.get_logger()
    log.info("10 seconds delay to wait for membase-server to start")
    time.sleep(10)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterHelper.wait_for_ns_servers_or_assert(servers, testcase)

def common_setup(input, testcase):
    log.info("============== common_setup was started for test #{0} {1}=============="
             .format(testcase.case_number, testcase._testMethodName))
    servers = input.servers
    RemoteUtilHelper.common_basic_setup(servers)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    log.info("============== common_setup was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def common_setup(input, testcase):
    log.info("============== common_setup was started for test #{0} {1}=============="
             .format(testcase.case_number, testcase._testMethodName))
    servers = input.servers
    RemoteUtilHelper.common_basic_setup(servers)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)
    log.info("============== common_setup was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def post_init_cb(self):
    # Optionally change node name and restart server
    if params.get('use_domain_names', False):
        RemoteUtilHelper.use_hostname_for_server_settings(self.node)

    # Optionally disable consistency check
    if params.get('disable_consistency', False):
        self.rest.set_couchdb_option(section='couchdb',
                                     option='consistency_check_ratio',
                                     value='0.0')

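# Illustrative only: a minimal sketch of the install-params dict the callback
# above reads. The two keys are the ones referenced in this callback (and in the
# installer snippet later in this collection); the values shown are assumptions,
# not the framework's defaults.
params = {
    'use_domain_names': True,       # switch node naming to hostnames and restart
    'disable_consistency': False,   # leave couchdb consistency_check_ratio as-is
}
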
def test_60s_timeout_firewall(self):
    timeout = self.timeout
    server_fail = self.servers[1]
    status = self.rest.update_autofailover_settings(True, timeout)
    if not status:
        self.fail('failed to change autofailover_settings! See MB-7282')
    self.sleep(5)
    RemoteUtilHelper.enable_firewall(server_fail)
    AutoFailoverBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

def test_firewall_node_when_autoreprovisioning(self): wait_timeout = 120 before = self.input.param("before", True) timeout = self.timeout / 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) if shell.extract_remote_info().type.lower() == 'windows': time.sleep(wait_timeout * 5) else: time.sleep(wait_timeout) # disable firewall on the node shell = RemoteMachineShellConnection(self.server_fail) shell.disable_firewall() AutoReprovisionBaseTest.wait_for_failover_or_assert( self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") # self.sleep(5) if before: RemoteUtilHelper.enable_firewall(self.servers[2]) self.rest.rebalance( otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) if not before: RemoteUtilHelper.enable_firewall(self.servers[2]) # self.sleep(5) try: self.rest.monitorRebalance() self.fail("Rebalance failed expected") except RebalanceFailedException: self.log.info("Rebalance failed but it's expected") shell = RemoteMachineShellConnection(self.servers[2]) shell.disable_firewall() self.sleep(5) self.rest.rebalance( otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def common_tearDown(servers, testcase):
    RemoteUtilHelper.common_basic_setup(servers)
    log = logger.Logger.get_logger()
    log.info("10 seconds delay to wait for couchbase-server to start")
    time.sleep(10)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)
    try:
        MemcachedClientHelper.flush_bucket(servers[0], 'default')
    except Exception:
        pass
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)

def setUp(self): log = logger.Logger.get_logger() self._input = TestInputSingleton.input self._keys_count = self._input.param("keys_count", DEFAULT_KEY_COUNT) self._num_replicas = self._input.param("replica", DEFAULT_REPLICA) self.bidirectional = self._input.param("bidirectional", False) self.case_number = self._input.param("case_number", 0) self._value_size = self._input.param("value_size", 256) self.wait_timeout = self._input.param("wait_timeout", 60) self._servers = self._input.servers self.master = self._servers[0] self._failed_nodes = [] num_buckets = 0 self.buckets = [] self.default_bucket = self._input.param("default_bucket", True) if self.default_bucket: self.default_bucket_name = "default" num_buckets += 1 self._standard_buckets = self._input.param("standard_buckets", 0) self._sasl_buckets = self._input.param("sasl_buckets", 0) num_buckets += self._standard_buckets + self._sasl_buckets self.dgm_run = self._input.param("dgm_run", True) self.log = logger.Logger().get_logger() self._cluster_helper = Cluster() self.disabled_consistent_view = self._input.param( "disabled_consistent_view", None) self._quota = self._initialize_nodes(self._cluster_helper, self._servers, self.disabled_consistent_view) if self.dgm_run: self.quota = 256 self.bucket_size = int( (2.0 / 3.0) / float(num_buckets) * float(self._quota)) self.gen_create = BlobGenerator('loadOne', 'loadOne_', self._value_size, end=self._keys_count) self.add_back_flag = False self._cleanup_nodes = [] log.info("============== setup was started for test #{0} {1}=============="\ .format(self.case_number, self._testMethodName)) RemoteUtilHelper.common_basic_setup(self._servers) BucketOperationHelper.delete_all_buckets_or_assert(self._servers, self) for server in self._servers: ClusterOperationHelper.cleanup_cluster([server]) ClusterHelper.wait_for_ns_servers_or_assert(self._servers, self) self._setup_cluster() self._create_buckets_() log.info("============== setup was finished for test #{0} {1} =============="\ .format(self.case_number, self._testMethodName))
def test_60s_timeout_firewall(self):
    # AUTOFAIL_TEST_5
    timeout = self.timeout
    server_fail = self._servers[1]
    status = self.rest.update_autofailover_settings(True, timeout)
    if not status:
        self.fail('failed to change autofailover_settings!')
    time.sleep(5)
    time_start = time.time()
    RemoteUtilHelper.enable_firewall(server_fail)
    AutoFailoverBaseTest.wait_for_failover_or_assert(self.master, 1, timeout, self)
    time_end = time.time()
    msg = "{0} != {1}".format(time_end - time_start, timeout)
    self.assertTrue(abs((time_end - time_start) - timeout) <= AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, msg)
    self.log.info("expected failover in {0} seconds, actual time {1} seconds"
                  .format(timeout, time_end - time_start))

def test_60s_timeout_firewall(self):
    # AUTOFAIL_TEST_5
    timeout = self.timeout
    server_fail = self._servers[1]
    status = self.rest.update_autofailover_settings(True, timeout)
    if not status:
        self.fail('failed to change autofailover_settings!')
    self.sleep(5)
    time_start = time.time()
    RemoteUtilHelper.enable_firewall(server_fail)
    AutoFailoverBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + self.extra_timeout, self)
    time_end = time.time()
    msg = "{0} != {1}".format(time_end - time_start, timeout + self.extra_timeout)
    self.assertTrue(abs((time_end - time_start) - timeout - self.extra_timeout)
                    <= AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, msg)
    self.log.info("expected failover in {0} seconds, actual time {1} seconds"
                  .format(timeout, time_end - time_start))

def pass_encrypted_in_logs_test(self):
    self.bucket_size = self._get_bucket_size(self.quota, 1)
    self._create_sasl_buckets(self.master, 1, password='******')
    bucket = self.buckets[-1]
    if self.input.param("load", 0):
        self.num_items = self.input.param("load", 0)
        self._load_doc_data_all_buckets()
    if self.input.param("views", 0):
        views = []
        for i in xrange(self.input.param("views", 0)):
            views.append(View("view_sasl" + str(i),
                              'function (doc, meta) {'
                              'emit(meta.id, "emitted_value%s");}' % str(i),
                              None, False))
        self.create_views(self.master, "ddoc", views, bucket)
    if self.input.param("rebalance", 0):
        self.cluster.rebalance(self.servers[:self.nodes_init],
                               self.servers[self.nodes_init:self.nodes_init + self.input.param("rebalance", 0)],
                               [])
    for server in self.servers[:self.nodes_init]:
        for log_file in ['debug', 'info', 'views', 'xdcr']:
            self.assertFalse(RemoteUtilHelper.is_text_present_in_logs(server, bucket.saslPassword,
                                                                      logs_to_check=log_file),
                             "%s logs contains password in plain text" % log_file)

def common_tearDown(servers, testcase):
    RemoteUtilHelper.common_basic_setup(servers)
    log = logger.Logger.get_logger()
    log.info("10 seconds delay to wait for couchbase-server to start")
    time.sleep(10)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)
    try:
        # this is a module-level helper, so use the servers argument
        rest = RestConnection(servers[0])
        buckets = rest.get_buckets()
        for bucket in buckets:
            MemcachedClientHelper.flush_bucket(servers[0], bucket.name)
    except Exception:
        pass
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)

def pass_encrypted_in_logs_test(self):
    self.bucket_size = self._get_bucket_size(self.quota, 1)
    self._create_sasl_buckets(self.master, 1, password='******')
    bucket = self.buckets[-1]
    if self.input.param("load", 0):
        self.num_items = self.input.param("load", 0)
        self._load_doc_data_all_buckets()
    if self.input.param("views", 0):
        views = []
        for i in range(self.input.param("views", 0)):
            views.append(View("view_sasl" + str(i),
                              'function (doc, meta) {'
                              'emit(meta.id, "emitted_value%s");}' % str(i),
                              None, False))
        self.create_views(self.master, "ddoc", views, bucket)
    if self.input.param("rebalance", 0):
        self.cluster.rebalance(self.servers[:self.nodes_init],
                               self.servers[self.nodes_init:self.nodes_init + self.input.param("rebalance", 0)],
                               [])
    for server in self.servers[:self.nodes_init]:
        for log_file in ['debug', 'info', 'views', 'xdcr']:
            self.assertFalse(RemoteUtilHelper.is_text_present_in_logs(server, bucket.saslPassword,
                                                                      logs_to_check=log_file),
                             "%s logs contains password in plain text" % log_file)

def tearDown(self):
    if hasattr(self, '_resultForDoCleanups') \
            and len(self._resultForDoCleanups.failures) > 0 \
            and 'stop-on-failure' in TestInputSingleton.input.test_params \
            and str(TestInputSingleton.input.test_params['stop-on-failure']).lower() == 'true':
        # supported starting with python2.7
        self.log.warn("CLEANUP WAS SKIPPED")
        self.cluster.shutdown(force=True)
    else:
        try:
            self.log.info("============== tearDown was started for test #{0} {1} =============="
                          .format(self.case_number, self._testMethodName))
            RemoteUtilHelper.common_basic_setup(self.cluster.servers)
            self.cluster_util.check_for_panic_and_mini_dumps(self.servers)
        finally:
            super(FailoverBaseTest, self).tearDown()

def test_firewall_node_when_autoreprovisioning(self): wait_timeout = 120 before = self.input.param("before", True) timeout = self.timeout / 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) shell = RemoteMachineShellConnection(self.server_fail) if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) if shell.extract_remote_info().type.lower() == 'windows': time.sleep(wait_timeout * 5) else: time.sleep(wait_timeout) # disable firewall on the node shell = RemoteMachineShellConnection(self.server_fail) shell.disable_firewall() AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(self.rest) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") # self.sleep(5) if before: RemoteUtilHelper.enable_firewall(self.servers[2]) self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) if not before: RemoteUtilHelper.enable_firewall(self.servers[2]) # self.sleep(5) try: self.rest.monitorRebalance() self.fail("Rebalance failed expected") except RebalanceFailedException: self.log.info("Rebalance failed but it's expected") shell = RemoteMachineShellConnection(self.servers[2]) shell.disable_firewall() self.sleep(5) self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) buckets = self.rest.get_buckets() for bucket in buckets: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def setUp(self): log = logger.Logger.get_logger() self._input = TestInputSingleton.input self._keys_count = self._input.param("keys_count", DEFAULT_KEY_COUNT) self._num_replicas = self._input.param("replica", DEFAULT_REPLICA) self.bidirectional = self._input.param("bidirectional", False) self.case_number = self._input.param("case_number", 0) self._value_size = self._input.param("value_size", 256) self.wait_timeout = self._input.param("wait_timeout", 60) self._servers = self._input.servers self.master = self._servers[0] self._failed_nodes = [] num_buckets = 0 self.buckets = [] self.default_bucket = self._input.param("default_bucket", True) if self.default_bucket: self.default_bucket_name = "default" num_buckets += 1 self._standard_buckets = self._input.param("standard_buckets", 0) self._sasl_buckets = self._input.param("sasl_buckets", 0) num_buckets += self._standard_buckets + self._sasl_buckets self.dgm_run = self._input.param("dgm_run", True) self.log = logger.Logger().get_logger() self._cluster_helper = Cluster() self.disabled_consistent_view = self._input.param("disabled_consistent_view", None) self._quota = self._initialize_nodes(self._cluster_helper, self._servers, self.disabled_consistent_view) if self.dgm_run: self.quota = 256 self.bucket_size = int((2.0 / 3.0) / float(num_buckets) * float(self._quota)) self.gen_create = BlobGenerator('loadOne', 'loadOne_', self._value_size, end=self._keys_count) self.add_back_flag = False self._cleanup_nodes = [] log.info("============== setup was started for test #{0} {1}=============="\ .format(self.case_number, self._testMethodName)) RemoteUtilHelper.common_basic_setup(self._servers) BucketOperationHelper.delete_all_buckets_or_assert(self._servers, self) for server in self._servers: ClusterOperationHelper.cleanup_cluster([server]) ClusterHelper.wait_for_ns_servers_or_assert(self._servers, self) self._setup_cluster() self._create_buckets_() log.info("============== setup was finished for test #{0} {1} =============="\ .format(self.case_number, self._testMethodName))
def common_tearDown(servers, testcase):
    log.info("============== common_tearDown was started for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))
    RemoteUtilHelper.common_basic_setup(servers)
    log.info("10 seconds delay to wait for couchbase-server to start")
    time.sleep(10)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(
        servers, testcase,
        wait_time=AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME * 10,
        wait_if_warmup=True)
    try:
        rest = RestConnection(servers[0])
        buckets = rest.get_buckets()
        for bucket in buckets:
            MemcachedClientHelper.flush_bucket(servers[0], bucket.name)
    except Exception:
        pass
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    log.info("============== common_tearDown was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def common_setup(input, testcase):
    log.info("============== common_setup was started for test #{0} {1}=============="
             .format(testcase.case_number, testcase._testMethodName))
    servers = input.servers
    RemoteUtilHelper.common_basic_setup(servers)
    BucketOperationHelper.delete_all_buckets_or_assert(servers, testcase)
    ClusterOperationHelper.cleanup_cluster(servers)
    ClusterOperationHelper.wait_for_ns_servers_or_assert(servers, testcase)

    # Add built-in user
    testuser = [{'id': 'cbadminbucket',
                 'name': 'cbadminbucket',
                 'password': '******'}]
    RbacBase().create_user_source(testuser, 'builtin', servers[0])
    time.sleep(10)

    # Assign user to role
    role_list = [{'id': 'cbadminbucket',
                  'name': 'cbadminbucket',
                  'roles': 'admin'}]
    RbacBase().add_user_role(role_list, RestConnection(servers[0]), 'builtin')
    time.sleep(10)

    log.info("============== common_setup was finished for test #{0} {1} =============="
             .format(testcase.case_number, testcase._testMethodName))

def test_node_memcached_failure(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    self._pause_couchbase(self.server_fail)
    self.sleep(5)
    AutoReprovisionBaseTest.wait_for_warmup_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    RemoteUtilHelper.common_basic_setup([self.server_fail])
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

def test_node_firewall_enabled(self):
    timeout = self.timeout / 2
    status = self.rest.update_autoreprovision_settings(True, 1)
    if not status:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    RemoteUtilHelper.enable_firewall(self.server_fail)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(5)
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.disable_firewall()
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()],
                        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
    buckets = self.rest.get_buckets()
    for bucket in buckets:
        self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

def test_cbcollect_with_redaction_enabled_with_xdcr(self): rest_src = RestConnection(self.master) rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() rest_dest = RestConnection(self.servers[1]) rest_dest_helper = RestHelper(rest_dest) try: rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() self.set_redaction_level() rest_src.add_remote_cluster(self.servers[1].ip, self.servers[1].port, self.servers[1].rest_username, self.servers[1].rest_password, "C2") """ at dest cluster """ self.add_built_in_server_user(node=self.servers[1]) rest_dest.create_bucket(bucket='default', ramQuotaMB=512) bucket_ready = rest_dest_helper.vbucket_map_ready('default') if not bucket_ready: self.fail( "Bucket default at dest not created after 120 seconds.") repl_id = rest_src.start_replication('continuous', 'default', "C2") if repl_id is not None: self.log.info("Replication created successfully") gen = BlobGenerator("ent-backup", "ent-backup-", self.value_size, end=self.num_items) tasks = self._async_load_all_buckets(self.master, gen, "create", 0) for task in tasks: task.result() self.sleep(10) """ enable firewall """ if self.interrupt_replication: RemoteUtilHelper.enable_firewall(self.master, xdcr=True) """ start collect logs """ self.start_logs_collection() result = self.monitor_logs_collection() """ verify logs """ try: logs_path = result["perNode"]["ns_1@" + str(self.master.ip)]["path"] except KeyError: logs_path = result["perNode"]["[email protected]"]["path"] redactFileName = logs_path.split('/')[-1] nonredactFileName = logs_path.split('/')[-1].replace( '-redacted', '') remotepath = logs_path[0:logs_path.rfind('/') + 1] self.verify_log_files_exist(remotepath=remotepath, redactFileName=redactFileName, nonredactFileName=nonredactFileName) self.log.info("Verify on log ns_server.goxdcr.log") self.verify_log_redaction(remotepath=remotepath, redactFileName=redactFileName, nonredactFileName=nonredactFileName, logFileName="ns_server.goxdcr.log") finally: """ clean up xdcr """ rest_dest.delete_bucket() rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() if self.interrupt_replication: shell = RemoteMachineShellConnection(self.master) shell.disable_firewall() shell.disconnect()
def common_test_body(self, keys_count, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(self.num_replicas)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self.servers) _servers_ = self.servers rest = RestConnection(self.master) nodes = rest.node_statuses() RebalanceHelper.wait_for_replication(self.servers, self.cluster) chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall(self.servers, node, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() for i in rest.get_logs(): self.log.error(i) self.fail("node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: # Need a delay > min because MB-7168 log.info("30 seconds sleep after failover before invoking rebalance...") time.sleep(30) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") RebalanceHelper.wait_for_replication(_servers_, self.cluster) self.verify_cluster_stats(_servers_, self.master)
def run_failover_operations(self, chosen, failover_reason):
    """ Method to run failover operations used in the test scenario based on failover reason """
    # Perform operations related to failover
    graceful_count = 0
    graceful_failover = True
    failed_over = True
    for node in chosen:
        unreachable = False
        if failover_reason == 'stop_server':
            unreachable = True
            self.stop_server(node)
            self.log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait for 5 minutes until node is down
            self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            unreachable = True
            self.filter_list.append(node.ip)
            server = [srv for srv in self.servers if node.ip == srv.ip][0]
            RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
            status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
            if status:
                self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something went wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        info = shell.extract_remote_info()
                        if info.type.lower() == "windows":
                            o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            shell.log_command_output(o, r)
                        else:
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                        shell.disconnect()
                self.rest.print_UI_logs()
                api = self.rest.baseUrl + 'nodeStatuses'
                status, content, header = self.rest._http_request(api)
                json_parsed = json.loads(content)
                self.log.info("nodeStatuses: {0}".format(json_parsed))
                self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # verify the failover type
        if self.check_verify_failover_type:
            graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count,
                                                                          self.num_replicas, unreachable)

        # define precondition check for failover
        success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and graceful_failover:
            if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                self.victim_node_operations(node)
                # Start Graceful Again
                self.log.info(" Start Graceful Failover Again !")
                success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            msg = "graceful failover failed for nodes {0}".format(node.id)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            msg = "rebalance failed while removing failover nodes {0}".format(node.id)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful failover was started for unhealthy node!!!")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for graceful failover, stuck or did not complete")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0,
                                          total_vbuckets=self.total_vbuckets, type="failover",
                                          graceful=(self.graceful and graceful_failover))

def enable_firewall(server):
    """Enable firewall on the given node.

    @param server: server object to enable the firewall on
    """
    RemoteUtilHelper.enable_firewall(server)

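# Illustrative only: how the wrapper above is typically paired with a firewall
# reset, assuming `server` is a test server object like those used throughout
# these snippets. RemoteMachineShellConnection.disable_firewall() is the
# counterpart the other tests here use to undo the block.
enable_firewall(server)                      # cut traffic to the node
# ... drive the failure scenario, e.g. wait for the node to be reported "unhealthy" ...
shell = RemoteMachineShellConnection(server)
shell.disable_firewall()                     # restore connectivity
shell.disconnect()
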
def run_failover_operations(self, chosen, failover_reason):
    """ Method to run failover operations used in the test scenario based on failover reason """
    # Perform operations related to failover
    graceful_count = 0
    graceful_failover = True
    failed_over = True
    for node in chosen:
        unreachable = False
        if failover_reason == 'stop_server':
            unreachable = True
            self.stop_server(node)
            self.log.info("10 seconds delay to wait for membase-server to shutdown")
            # wait for 5 minutes until node is down
            self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                            msg="node status is not unhealthy even after waiting for 5 minutes")
        elif failover_reason == "firewall":
            unreachable = True
            self.filter_list.append(node.ip)
            server = [srv for srv in self.servers if node.ip == srv.ip][0]
            RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
            status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
            if status:
                self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
            else:
                # verify iptables on the node if something went wrong
                for server in self.servers:
                    if server.ip == node.ip:
                        shell = RemoteMachineShellConnection(server)
                        info = shell.extract_remote_info()
                        if info.type.lower() == "windows":
                            o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            shell.log_command_output(o, r)
                        else:
                            o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                        shell.disconnect()
                self.rest.print_UI_logs()
                api = self.rest.baseUrl + 'nodeStatuses'
                status, content, header = self.rest._http_request(api)
                json_parsed = json.loads(content)
                self.log.info("nodeStatuses: {0}".format(json_parsed))
                self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # verify the failover type
        if self.check_verify_failover_type:
            graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count,
                                                                          self.num_replicas, unreachable)

        # define precondition check for failover
        success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and graceful_failover:
            if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                self.victim_node_operations(node)
                # Start Graceful Again
                self.log.info(" Start Graceful Failover Again !")
                self.sleep(120)
                success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                self.sleep(180)
            msg = "graceful failover failed for nodes {0}".format(node.id)
            self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen), self.get_failover_count()))
            self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
        else:
            msg = "rebalance failed while removing failover nodes {0}".format(node.id)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful failover was started for unhealthy node!!!")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for graceful failover, stuck or did not complete")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0,
                                          total_vbuckets=self.total_vbuckets, type="failover",
                                          graceful=(self.graceful and graceful_failover))

def initialize(self, params): log.info('***** CouchbaseServerInstaller: initializing the application *****') start_time = time.time() cluster_initialized = False server = params["server"] remote_client = RemoteMachineShellConnection(params["server"]) while time.time() < start_time + 5 * 60: try: rest = RestConnection(server) # Optionally change node name and restart server if params.get('use_domain_names', 0): RemoteUtilHelper.use_hostname_for_server_settings(server) # Make sure that data_path and index_path are writable by couchbase user for path in set(filter(None, [server.data_path, server.index_path])): time.sleep(3) for cmd in ("rm -rf {0}/*".format(path), "chown -R couchbase:couchbase {0}".format(path)): remote_client.execute_command(cmd) rest.set_data_path(data_path=server.data_path, index_path=server.index_path) time.sleep(3) # Initialize cluster if "init_nodes" in params: init_nodes = params["init_nodes"] else: init_nodes = "True" if (isinstance(init_nodes, bool) and init_nodes) or \ (isinstance(init_nodes, str) and init_nodes.lower() == "true"): if not server.services: set_services = ["kv"] else: set_services = server.services.split(',') kv_quota = 0 while kv_quota == 0: time.sleep(1) kv_quota = int(rest.get_nodes_self().mcdMemoryReserved) info = rest.get_nodes_self() cb_version = info.version[:5] if cb_version in COUCHBASE_FROM_VERSION_4: if "index" in set_services and "fts" not in set_services: log.info("quota for index service will be %s MB" \ % (INDEX_QUOTA)) kv_quota = int(info.mcdMemoryReserved * 2/3) - INDEX_QUOTA log.info("set index quota to node %s " % server.ip) rest.set_indexer_memoryQuota(indexMemoryQuota=INDEX_QUOTA) if kv_quota < MIN_KV_QUOTA: raise Exception("KV RAM needs to be more than %s MB" " at node %s" % (MIN_KV_QUOTA, server.ip)) elif "index" in set_services and "fts" in set_services: log.info("quota for index service will be %s MB" \ % (INDEX_QUOTA)) log.info("quota for fts service will be %s MB" \ % (FTS_QUOTA)) kv_quota = int(info.mcdMemoryReserved * 2/3)\ - INDEX_QUOTA \ - FTS_QUOTA log.info("set both index and fts quota at node %s "\ % server.ip) rest.set_indexer_memoryQuota(indexMemoryQuota=INDEX_QUOTA) rest.set_fts_memoryQuota(ftsMemoryQuota=FTS_QUOTA) if kv_quota < MIN_KV_QUOTA: raise Exception("KV RAM needs to be more than %s MB" " at node %s" % (MIN_KV_QUOTA, server.ip)) elif "fts" in set_services and "index" not in set_services: log.info("quota for fts service will be %s MB" \ % (FTS_QUOTA)) kv_quota = int(info.mcdMemoryReserved * 2/3) - FTS_QUOTA if kv_quota < MIN_KV_QUOTA: raise Exception("KV RAM needs to be more than %s MB" " at node %s" % (MIN_KV_QUOTA, server.ip)) """ For fts we need to pull the quota from ns_server, but this must still work even when the VM has less than 2 GB of RAM """ rest.set_fts_memoryQuota(ftsMemoryQuota=FTS_QUOTA) """ Shrink the kv quota by 1 MB so that it stays below the allowed quota """ kv_quota -= 1 log.info("quota for kv: %s MB" % kv_quota) rest.init_cluster_memoryQuota(server.rest_username, \ server.rest_password, \ kv_quota) if params["version"][:5] in COUCHBASE_FROM_VERSION_4: rest.init_node_services(username=server.rest_username, password=server.rest_password, services=set_services) rest.init_cluster(username=server.rest_username, password=server.rest_password) # Optionally disable consistency check if params.get('disable_consistency', 0): rest.set_couchdb_option(section='couchdb', option='consistency_check_ratio', value='0.0') # memcached env variable mem_req_tap_env = params.get('MEMCACHED_REQS_TAP_EVENT', 0) if mem_req_tap_env: remote_client.set_environment_variable('MEMCACHED_REQS_TAP_EVENT', mem_req_tap_env) """ Set cbauth environment variables; applies from the Watson release onward (the version check is done inside the method) """ remote_client.set_cbauth_env(server) remote_client.disconnect() # TODO: Make it work with windows if "erlang_threads" in params: num_threads = params.get('erlang_threads', testconstants.NUM_ERLANG_THREADS) # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) if "sync_threads" in params or ':' in num_threads: sync_threads = params.get('sync_threads', True) else: sync_threads = False # Change type of threads (sync/async) and num erlang threads ClusterOperationHelper.change_erlang_threads_values([server], sync_threads, num_threads) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) if "erlang_gc_level" in params: erlang_gc_level = params.get('erlang_gc_level', None) if erlang_gc_level is None: # Don't change the value break # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) # Change num erlang threads ClusterOperationHelper.change_erlang_gc([server], erlang_gc_level) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) cluster_initialized = True break except ServerUnavailableException: log.error("error happened while initializing the cluster @ {0}".format(server.ip)) log.info('sleep for 5 seconds before trying again ...') time.sleep(5) if not cluster_initialized: sys.exit("unable to initialize couchbase node")
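# The quota branches in initialize() above carve two-thirds of mcdMemoryReserved into
# KV, index, and fts quotas. A simplified standalone sketch of that arithmetic; the
# numeric defaults below are placeholders, not the framework's actual INDEX_QUOTA,
# FTS_QUOTA, and MIN_KV_QUOTA constants.
def split_memory_quota(mcd_memory_reserved_mb, services,
                       index_quota_mb=512, fts_quota_mb=256, min_kv_quota_mb=256):
    """Return (kv, index, fts) quotas in MB, mirroring the branches above in simplified form."""
    kv_quota = int(mcd_memory_reserved_mb * 2 / 3)
    index_quota = index_quota_mb if "index" in services else 0
    fts_quota = fts_quota_mb if "fts" in services else 0
    kv_quota -= index_quota + fts_quota
    if kv_quota < min_kv_quota_mb:
        raise Exception("KV RAM needs to be more than %s MB" % min_kv_quota_mb)
    return kv_quota, index_quota, fts_quota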
def common_test_body(self, keys_count, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(self.num_replicas)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self.servers) _servers_ = self.servers rest = RestConnection(self.master) nodes = rest.node_statuses() RebalanceHelper.wait_for_replication(self.servers, self.cluster) chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() for i in rest.get_logs(): self.log.error(i) api = rest.baseUrl + 'nodeStatuses' status, content, header = rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: # Need a delay > min because MB-7168 log.info("60 seconds sleep after failover before invoking rebalance...") time.sleep(60) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") RebalanceHelper.wait_for_replication(_servers_, self.cluster) self.verify_cluster_stats(_servers_, self.master)
def initialize(self, params): start_time = time.time() cluster_initialized = False server = params["server"] remote_client = RemoteMachineShellConnection(params["server"]) while time.time() < start_time + 5 * 60: try: rest = RestConnection(server) # Optionally change node name and restart server if params.get('use_domain_names', 0): RemoteUtilHelper.use_hostname_for_server_settings(server) # Make sure that data_path and index_path are writable by couchbase user for path in set(filter(None, [server.data_path, server.index_path])): time.sleep(3) for cmd in ("rm -rf {0}/*".format(path), "chown -R couchbase:couchbase {0}".format(path)): remote_client.execute_command(cmd) rest.set_data_path(data_path=server.data_path, index_path=server.index_path) time.sleep(3) # Initialize cluster if "init_nodes" in params: init_nodes = params["init_nodes"] else: init_nodes = "True" if init_nodes.lower() == "true": rest.init_cluster(username=server.rest_username, password=server.rest_password) memory_quota = rest.get_nodes_self().mcdMemoryReserved rest.init_cluster_memoryQuota(memoryQuota=memory_quota) # TODO: Symlink data-dir to custom path # remote_client.stop_couchbase() # remote_client.execute_command('mv /opt/couchbase/var {0}'.format(server.data_path)) # remote_client.execute_command('ln -s {0}/var /opt/couchbase/var'.format(server.data_path)) # remote_client.execute_command("chown -h couchbase:couchbase /opt/couchbase/var") # remote_client.start_couchbase() # Optionally disable consistency check if params.get('disable_consistency', 0): rest.set_couchdb_option(section='couchdb', option='consistency_check_ratio', value='0.0') # memcached env variable mem_req_tap_env = params.get('MEMCACHED_REQS_TAP_EVENT', 0) if mem_req_tap_env: remote_client.set_environment_variable('MEMCACHED_REQS_TAP_EVENT', mem_req_tap_env) remote_client.disconnect() # TODO: Make it work with windows if "erlang_threads" in params: num_threads = params.get('erlang_threads', testconstants.NUM_ERLANG_THREADS) # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) if "sync_threads" in params or ':' in num_threads: sync_threads = params.get('sync_threads', True) else: sync_threads = False # Change type of threads(sync/async) and num erlang threads ClusterOperationHelper.change_erlang_threads_values([server], sync_threads, num_threads) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) if "erlang_gc_level" in params: erlang_gc_level = params.get('erlang_gc_level', None) if erlang_gc_level is None: # Don't change the value break # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) # Change num erlang threads ClusterOperationHelper.change_erlang_gc([server], erlang_gc_level) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) cluster_initialized = True break except ServerUnavailableException: log.error("error happened while initializing the cluster @ {0}".format(server.ip)) log.info('sleep for 5 seconds before trying again ...') time.sleep(5) if not cluster_initialized: sys.exit("unable to initialize couchbase node")
def common_test_body(self, keys_count, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(self.num_replicas)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self.servers) _servers_ = self.servers rest = RestConnection(self.master) nodes = rest.node_statuses() RebalanceHelper.wait_for_replication(self.servers, self.cluster) chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas) for node in chosen: # let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") # wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": server = [srv for srv in self.servers if node.ip == srv.ip][0] RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port)) else: # verify iptables on the node if something wrong for server in self.servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) info = shell.extract_remote_info() if info.type.lower() == "windows": o, r = shell.execute_command("netsh advfirewall show allprofiles") else: o, r = shell.execute_command("/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() for i in rest.get_logs(): self.log.error(i) api = rest.baseUrl + 'nodeStatuses' status, content, header = rest._http_request(api) json_parsed = json.loads(content) self.log.info("nodeStatuses: {0}".format(json_parsed)) self.fail("node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") # try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: # Need a delay > min because MB-7168 log.info("60 seconds sleep after failover before invoking rebalance...") time.sleep(60) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password(new_password=self.input.param("new_password", "new_pass")) rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) rest = RestConnection(self.master) try: msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") RebalanceHelper.wait_for_replication(_servers_, self.cluster) self.verify_cluster_stats(_servers_, self.master) finally: if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090"))
def initialize(self, params): start_time = time.time() cluster_initialized = False server = params["server"] remote_client = RemoteMachineShellConnection(params["server"]) while time.time() < start_time + 5 * 60: try: rest = RestConnection(server) # Optionally change node name and restart server if params.get('use_domain_names', 0): RemoteUtilHelper.use_hostname_for_server_settings(server) # Make sure that data_path and index_path are writable by couchbase user for path in set(filter(None, [server.data_path, server.index_path])): time.sleep(3) for cmd in ("rm -rf {0}/*".format(path), "chown -R couchbase:couchbase {0}".format(path)): remote_client.execute_command(cmd) rest.set_data_path(data_path=server.data_path, index_path=server.index_path) time.sleep(3) # Initialize cluster rest.init_cluster(username=server.rest_username, password=server.rest_password) memory_quota = rest.get_nodes_self().mcdMemoryReserved rest.init_cluster_memoryQuota(memoryQuota=memory_quota) # TODO: Symlink data-dir to custom path # remote_client.stop_couchbase() # remote_client.execute_command('mv /opt/couchbase/var {0}'.format(server.data_path)) # remote_client.execute_command('ln -s {0}/var /opt/couchbase/var'.format(server.data_path)) # remote_client.execute_command("chown -h couchbase:couchbase /opt/couchbase/var") # remote_client.start_couchbase() # Optionally disable consistency check if params.get('disable_consistency', 0): rest.set_couchdb_option(section='couchdb', option='consistency_check_ratio', value='0.0') # memcached env variable mem_req_tap_env = params.get('MEMCACHED_REQS_TAP_EVENT', 0) if mem_req_tap_env: remote_client.set_environment_variable('MEMCACHED_REQS_TAP_EVENT', mem_req_tap_env) remote_client.disconnect() # TODO: Make it work with windows if "erlang_threads" in params: num_threads = params.get('erlang_threads', testconstants.NUM_ERLANG_THREADS) # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) if "sync_threads" in params or ':' in num_threads: sync_threads = params.get('sync_threads', True) else: sync_threads = False # Change type of threads(sync/async) and num erlang threads ClusterOperationHelper.change_erlang_threads_values([server], sync_threads, num_threads) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) if "erlang_gc_level" in params: erlang_gc_level = params.get('erlang_gc_level', None) if erlang_gc_level is None: # Don't change the value break # Stop couchbase-server ClusterOperationHelper.stop_cluster([server]) # Change num erlang threads ClusterOperationHelper.change_erlang_gc([server], erlang_gc_level) # Start couchbase-server ClusterOperationHelper.start_cluster([server]) cluster_initialized = True break except ServerUnavailableException: log.error("error happened while initializing the cluster @ {0}".format(server.ip)) log.info('sleep for 5 seconds before trying again ...') time.sleep(5) if not cluster_initialized: sys.exit("unable to initialize couchbase node")
def test_cbcollect_with_redaction_enabled_with_xdcr(self): rest_src = RestConnection(self.master) rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() rest_dest = RestConnection(self.servers[1]) rest_dest_helper = RestHelper(rest_dest) try: rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() self.set_redaction_level() rest_src.add_remote_cluster(self.servers[1].ip, self.servers[1].port, self.servers[1].rest_username, self.servers[1].rest_password, "C2") """ at dest cluster """ self.add_built_in_server_user(node=self.servers[1]) rest_dest.create_bucket(bucket='default', ramQuotaMB=512) bucket_ready = rest_dest_helper.vbucket_map_ready('default') if not bucket_ready: self.fail("Bucket default at dest not created after 120 seconds.") repl_id = rest_src.start_replication('continuous', 'default', "C2") if repl_id is not None: self.log.info("Replication created successfully") gen = BlobGenerator("ent-backup", "ent-backup-", self.value_size, end=self.num_items) tasks = self._async_load_all_buckets(self.master, gen, "create", 0) for task in tasks: task.result() self.sleep(10) """ enable firewall """ if self.interrupt_replication: RemoteUtilHelper.enable_firewall(self.master, xdcr=True) """ start collect logs """ self.start_logs_collection() result = self.monitor_logs_collection() """ verify logs """ try: logs_path = result["perNode"]["ns_1@" + str(self.master.ip)]["path"] except KeyError: logs_path = result["perNode"]["ns_1@127.0.0.1"]["path"] redactFileName = logs_path.split('/')[-1] nonredactFileName = logs_path.split('/')[-1].replace('-redacted', '') remotepath = logs_path[0:logs_path.rfind('/')+1] self.verify_log_files_exist(remotepath=remotepath, redactFileName=redactFileName, nonredactFileName=nonredactFileName) self.log.info("Verify on log ns_server.goxdcr.log") self.verify_log_redaction(remotepath=remotepath, redactFileName=redactFileName, nonredactFileName=nonredactFileName, logFileName="ns_server.goxdcr.log") finally: """ clean up xdcr """ rest_dest.delete_bucket() rest_src.remove_all_replications() rest_src.remove_all_remote_clusters() if self.interrupt_replication: shell = RemoteMachineShellConnection(self.master) shell.disable_firewall() shell.disconnect()
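# A rough illustration of what the verify_log_redaction step amounts to (not the
# framework's implementation): the redacted ns_server.goxdcr.log should no longer
# contain the raw "ent-backup" document keys loaded above, while the non-redacted
# copy may. The file path handling here is an assumption for the sketch.
def redacted_log_hides_keys(redacted_log_path, key_prefix="ent-backup"):
    """Return True if no raw key with the given prefix appears in the redacted log."""
    with open(redacted_log_path) as f:
        return all(key_prefix not in line for line in f)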
def common_test_body(self, keys_count, replica, load_ratio, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replica : {0}".format(replica)) log.info("load_ratio : {0}".format(load_ratio)) log.info("failover_reason : {0}".format(failover_reason)) master = self._servers[0] log.info('picking server : {0} as the master'.format(master)) rest = RestConnection(master) info = rest.get_nodes_self() rest.init_cluster(username=master.rest_username, password=master.rest_password) rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved) bucket_ram = info.memoryQuota * 2 / 3 bucket = 'default' rest.create_bucket(bucket=bucket, ramQuotaMB=bucket_ram, replicaNumber=replica, proxyPort=info.moxi) ready = BucketOperationHelper.wait_for_memcached(master, bucket) self.assertTrue(ready, "wait_for_memcached_failed") credentials = self._input.membase_settings ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers, credentials, self) nodes = rest.node_statuses() rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed after adding these nodes {0}".format(nodes) self.assertTrue(rest.monitorRebalance(), msg=msg) inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count, load_ratio) inserted_count = len(inserted_keys) log.info('inserted {0} keys'.format(inserted_count)) nodes = rest.node_statuses() while (len(nodes) - replica) > 1: final_replication_state = RestHelper(rest).wait_for_replication( 900) msg = "replication state after waiting for up to 15 minutes : {0}" self.log.info(msg.format(final_replication_state)) chosen = RebalanceHelper.pick_nodes(master, howmany=replica) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info( "10 seconds delay to wait for membase-server to shutdown" ) #wait for 5 minutes until node is down self.assertTrue( RestHelper(rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall( self._servers, node, bidirectional=self.bidirectional) self.assertTrue( RestHelper(rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) failed_over = rest.fail_over(node.id) if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." ) #try again in 60 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue( failed_over, "unable to failover node after {0}".format( failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node.ip) log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) FailoverBaseTest.replication_verification(master, bucket, replica, inserted_count, self) nodes = rest.node_statuses() FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
def test_node_memcached_failure_in_series(self): timeout = self.timeout / 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) data_lost = False for i in reversed(xrange(len(self.servers))): print self.servers[i] operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot']) shell = RemoteMachineShellConnection(self.servers[i]) print "operation", operation if i == 0: self.master = self.servers[1] if operation == 'stop': self._stop_couchbase(self.servers[i]) elif operation == 'memcached_failure': self._pause_couchbase(self.servers[i]) elif operation == 'restart': shell.restart_couchbase() elif operation == 'failover': RemoteUtilHelper.enable_firewall(self.servers[i]) elif operation == 'reboot': if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") self.sleep(200) elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) self.sleep(60) self.sleep(40) if operation == 'memcached_failure': AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot': AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart': RemoteUtilHelper.common_basic_setup([self.servers[i]]) AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(RestConnection(self.master)) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.sleep(40) if operation == 'memcached_failure' or operation == 'failover': self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") else: if 'kv' in self.servers[i].services and self.replicas > 0: self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) else: self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") buckets = self.rest.get_buckets() if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'): data_lost = True for bucket in buckets: if not data_lost: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def common_test_body(self, keys_count, replica, load_ratio, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replica : {0}".format(replica)) log.info("load_ratio : {0}".format(load_ratio)) log.info("failover_reason : {0}".format(failover_reason)) master = self._servers[0] log.info('picking server : {0} as the master'.format(master)) rest = RestConnection(master) info = rest.get_nodes_self() rest.init_cluster(username=master.rest_username, password=master.rest_password) rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved) bucket_ram = info.memoryQuota * 2 / 3 bucket = 'default' rest.create_bucket(bucket=bucket, ramQuotaMB=bucket_ram, replicaNumber=replica, proxyPort=info.moxi) ready = BucketOperationHelper.wait_for_memcached(master, bucket) self.assertTrue(ready, "wait_for_memcached_failed") credentials = self._input.membase_settings ClusterOperationHelper.add_all_nodes_or_assert(master, self._servers, credentials, self) nodes = rest.node_statuses() rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed after adding these nodes {0}".format(nodes) self.assertTrue(rest.monitorRebalance(), msg=msg) inserted_keys = FailoverBaseTest.load_data(master, bucket, keys_count, load_ratio) inserted_count = len(inserted_keys) log.info('inserted {0} keys'.format(inserted_count)) nodes = rest.node_statuses() while (len(nodes) - replica) > 1: final_replication_state = RestHelper(rest).wait_for_replication(900) msg = "replication state after waiting for up to 15 minutes : {0}" self.log.info(msg.format(final_replication_state)) chosen = RebalanceHelper.pick_nodes(master, howmany=replica) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info("10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall(self._servers, node, bidirectional=self.bidirectional) self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes") failed_over = rest.fail_over(node.id) if not failed_over: self.log.info("unable to failover the node the first time. try again in 60 seconds..") #try again in 60 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) #REMOVEME - log.info("10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format(chosen) self.assertTrue(rest.monitorRebalance(), msg=msg) FailoverBaseTest.replication_verification(master, bucket, replica, inserted_count, self) nodes = rest.node_statuses() FailoverBaseTest.verify_data(master, inserted_keys, bucket, self)
def test_node_memcached_failure_in_series(self): timeout = self.timeout / 2 status = self.rest.update_autoreprovision_settings(True, 1) if not status: self.fail('failed to change autoreprovision_settings!') self.sleep(5) data_lost = False for i in reversed(xrange(len(self.servers))): print self.servers[i] operation = random.choice( ['stop', 'memcached_failure', 'restart', 'failover', 'reboot']) shell = RemoteMachineShellConnection(self.servers[i]) print "operation", operation if i == 0: self.master = self.servers[1] if operation == 'stop': self._stop_couchbase(self.servers[i]) elif operation == 'memcached_failure': self._pause_couchbase(self.servers[i]) elif operation == 'restart': shell.restart_couchbase() elif operation == 'failover': RemoteUtilHelper.enable_firewall(self.servers[i]) elif operation == 'reboot': if shell.extract_remote_info().type.lower() == 'windows': o, r = shell.execute_command("shutdown -r -f -t 0") self.sleep(200) elif shell.extract_remote_info().type.lower() == 'linux': o, r = shell.execute_command("reboot") shell.log_command_output(o, r) self.sleep(60) self.sleep(40) if operation == 'memcached_failure': AutoReprovisionBaseTest.wait_for_warmup_or_assert( self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot': AutoReprovisionBaseTest.wait_for_failover_or_assert( self.master, 1, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) if operation != 'restart': RemoteUtilHelper.common_basic_setup([self.servers[i]]) AutoReprovisionBaseTest.wait_for_failover_or_assert( self.master, 0, timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self) helper = RestHelper(RestConnection(self.master)) self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy") self.sleep(40) if operation == 'memcached_failure' or operation == 'failover': self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") else: if 'kv' in self.servers[i].services and self.replicas > 0: self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced") self.rest.rebalance(otpNodes=[ node.id for node in self.rest.node_statuses() ], ejectedNodes=[]) self.assertTrue(self.rest.monitorRebalance()) else: self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced") buckets = self.rest.get_buckets() if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'): data_lost = True for bucket in buckets: if not data_lost: self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
def common_test_body(self, keys_count, replica, failover_reason): log = logger.Logger.get_logger() log.info("keys_count : {0}".format(keys_count)) log.info("replicas : {0}".format(replica)) log.info("failover_reason : {0}".format(failover_reason)) log.info('picking server : {0} as the master'.format(self.master)) self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180) self._wait_for_stats_all_buckets(self._servers) _servers_ = self._servers rest = RestConnection(self.master) nodes = rest.node_statuses() self._wait_for_replication(self._servers, timeout=600) chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica) for node in chosen: #let's do op if failover_reason == 'stop_server': self.stop_server(node) log.info( "10 seconds delay to wait for membase-server to shutdown") #wait for 5 minutes until node is down self.assertTrue( RestHelper(rest).wait_for_node_status( node, "unhealthy", 300), msg= "node status is not unhealthy even after waiting for 5 minutes" ) elif failover_reason == "firewall": RemoteUtilHelper.enable_firewall( self._servers, node, bidirectional=self.bidirectional) status = RestHelper(rest).wait_for_node_status( node, "unhealthy", 300) if status: log.info("node {0}:{1} is 'unhealthy' as expected".format( node.ip, node.port)) else: #verify iptables on the node if something wrong for server in self._servers: if server.ip == node.ip: shell = RemoteMachineShellConnection(server) o, r = shell.execute_command( "/sbin/iptables --list") shell.log_command_output(o, r) shell.disconnect() self.assertTrue( status, msg= "node status is not unhealthy even after waiting for 5 minutes" ) failed_over = rest.fail_over(node.id) if not failed_over: self.log.info( "unable to failover the node the first time. try again in 60 seconds.." ) #try again in 75 seconds time.sleep(75) failed_over = rest.fail_over(node.id) self.assertTrue( failed_over, "unable to failover node after {0}".format(failover_reason)) log.info("failed over node : {0}".format(node.id)) self._failed_nodes.append(node) if self.add_back_flag: for node in self._failed_nodes: rest.add_back_node(node.id) time.sleep(5) log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) else: log.info( "10 seconds sleep after failover before invoking rebalance...") time.sleep(10) rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen]) msg = "rebalance failed while removing failover nodes {0}".format( chosen) self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg) for failed in chosen: for server in _servers_: if server.ip == failed.ip: _servers_.remove(server) self._cleanup_nodes.append(server) log.info("Begin VERIFICATION ...") self._wait_for_stats_all_buckets(_servers_) self._wait_for_replication(self._servers, timeout=600) self._verify_stats_all_buckets(_servers_) self._verify_all_buckets(self.master)
def start_firewall_on_node(self, node): """ Method to start a server which is subject to failover """ for server in self.cluster.servers: if server.ip == node.ip: RemoteUtilHelper.enable_firewall(server)
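# enable_firewall() is what makes the chosen node look "unhealthy" to the rest of the
# cluster. A minimal sketch of the idea, assuming a Linux node and reusing the same
# port range and interface the teardown code re-opens (1000:60000 on eth0); the real
# helper may differ (for example, Windows nodes use netsh instead of iptables).
def block_couchbase_ports(server, bidirectional=False):
    """Reject inbound (and optionally outbound) traffic on the Couchbase port range."""
    shell = RemoteMachineShellConnection(server)
    o, r = shell.execute_command(
        "/sbin/iptables -A INPUT -p tcp -i eth0 --dport 1000:60000 -j REJECT")
    shell.log_command_output(o, r)
    if bidirectional:
        o, r = shell.execute_command(
            "/sbin/iptables -A OUTPUT -p tcp -o eth0 --dport 1000:60000 -j REJECT")
        shell.log_command_output(o, r)
    shell.disconnect()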