def test_node_reboot(self):
    """Reboot ``self.server_fail`` and check that the cluster recovers.

    Applies auto-reprovision settings ``(True, 1)``, reboots the node over
    a remote shell, waits, disables the node's firewall, then asserts that
    the cluster is healthy but not balanced, rebalances it and verifies the
    loaded items of every bucket.
    """
    wait_timeout = 120
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    shell = RemoteMachineShellConnection(self.server_fail)
    # Read the OS type once, before the reboot command is sent, instead of
    # asking the node again while it is already going down.
    os_type = shell.extract_remote_info().type.lower()
    if os_type == 'windows':
        o, r = shell.execute_command("shutdown -r -f -t 0")
    elif os_type == 'linux':
        o, r = shell.execute_command("reboot")
    else:
        # Without this branch ``o``/``r`` stay unbound and the logging
        # call below dies with an UnboundLocalError.
        self.fail('do not know how to reboot a node of type %r' % os_type)
    shell.log_command_output(o, r)

    # The Windows wait is five times the Linux one.
    if os_type == 'windows':
        time.sleep(wait_timeout * 5)
    else:
        time.sleep(wait_timeout)

    # disable firewall on the node (a new connection is opened after the
    # reboot)
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.disable_firewall()

    # NOTE(review): presumably waits until 0 nodes are reported as failed
    # over -- confirm against AutoReprovisionBaseTest.
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    for bucket in self.rest.get_buckets():
        self.verify_loaded_data(self.master, bucket.name,
                                self.loaded_items[bucket.name])
def test_node_reboot(self):
    """Reboot ``self.server_fail`` and check that the cluster recovers.

    Applies auto-reprovision settings ``(True, 1)``, reboots the node over
    a remote shell, waits, disables the node's firewall, then asserts that
    the cluster is healthy but not balanced, rebalances it and verifies the
    loaded items of every bucket.
    """
    wait_timeout = 120
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    shell = RemoteMachineShellConnection(self.server_fail)
    # Read the OS type once, before the reboot command is sent, instead of
    # asking the node again while it is already going down.
    os_type = shell.extract_remote_info().type.lower()
    if os_type == 'windows':
        o, r = shell.execute_command("shutdown -r -f -t 0")
    elif os_type == 'linux':
        o, r = shell.execute_command("reboot")
    else:
        # Without this branch ``o``/``r`` stay unbound and the logging
        # call below dies with an UnboundLocalError.
        self.fail('do not know how to reboot a node of type %r' % os_type)
    shell.log_command_output(o, r)

    # The Windows wait is five times the Linux one.
    if os_type == 'windows':
        time.sleep(wait_timeout * 5)
    else:
        time.sleep(wait_timeout)

    # disable firewall on the node (a new connection is opened after the
    # reboot)
    shell = RemoteMachineShellConnection(self.server_fail)
    shell.disable_firewall()

    # NOTE(review): presumably waits until 0 nodes are reported as failed
    # over -- confirm against AutoReprovisionBaseTest.
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    for bucket in self.rest.get_buckets():
        self.verify_loaded_data(self.master, bucket.name,
                                self.loaded_items[bucket.name])
def test_node_cb_restart(self):
    """Restart Couchbase on ``self.server_fail`` and check recovery.

    Applies auto-reprovision settings ``(True, 1)``, restarts the service
    through a remote shell, waits on the failover count going to 1 and then
    back to 0, asserts the cluster is healthy but not balanced, rebalances
    and verifies the loaded items of every bucket.
    """
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    shell = RemoteMachineShellConnection(self.server_fail)
    shell.restart_couchbase()

    # NOTE(review): the second positional argument looks like the expected
    # number of failed-over nodes (1 while restarting, 0 once recovered)
    # -- confirm against AutoReprovisionBaseTest.
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(5)

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    # The message is shown when the assertion fails, i.e. when the cluster
    # unexpectedly IS balanced; the old text said the opposite.
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    for bucket in self.rest.get_buckets():
        self.verify_loaded_data(self.master, bucket.name,
                                self.loaded_items[bucket.name])
def test_two_failed_nodes(self):
    """Stop ``servers[1]`` and ``servers[2]`` in turn, then restore both.

    With auto-reprovision settings ``(True, 1)`` applied, the test stops
    the two servers one after the other, asserts the cluster is unhealthy
    yet balanced, restarts both servers, and finally asserts the cluster is
    healthy, not balanced, and can be rebalanced successfully.
    """
    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    # NOTE(review): the second positional argument of
    # wait_for_failover_or_assert looks like the expected number of
    # failed-over nodes -- confirm against AutoReprovisionBaseTest.
    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    helper = RestHelper(self.rest)
    self.assertFalse(helper.is_cluster_healthy(), "cluster status is healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")

    # The first server used to be started twice in a row; one start per
    # server is enough.
    self._start_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self._start_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(20)

    helper = RestHelper(self.rest)
    # Failure message fixed: this assertion fails when the cluster is NOT
    # healthy.
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
def test_two_failed_nodes(self):
    """Stop ``servers[1]`` and ``servers[2]`` in turn, then restore both.

    With auto-reprovision settings ``(True, 1)`` applied, the test stops
    the two servers one after the other, asserts the cluster is unhealthy
    yet balanced, restarts both servers, and finally asserts the cluster is
    healthy, not balanced, and can be rebalanced successfully.
    """
    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    # NOTE(review): the second positional argument of
    # wait_for_failover_or_assert looks like the expected number of
    # failed-over nodes -- confirm against AutoReprovisionBaseTest.
    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    helper = RestHelper(self.rest)
    self.assertFalse(helper.is_cluster_healthy(), "cluster status is healthy")
    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")

    # The first server used to be started twice in a row; one start per
    # server is enough.
    self._start_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self._start_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(20)

    helper = RestHelper(self.rest)
    # Failure message fixed: this assertion fails when the cluster is NOT
    # healthy.
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())
def test_node_memcached_failure(self):
    """Pause Couchbase on ``self.server_fail`` and check recovery.

    After applying auto-reprovision settings ``(True, 1)`` the node is
    paused, the warmup wait is performed, the node is put back through
    ``RemoteUtilHelper.common_basic_setup`` and the test asserts that the
    cluster ends up healthy and balanced with all loaded items intact.
    """
    # Same upper bound is used for both waits below.
    max_wait = self.timeout / 2 + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME

    settings_applied = self.rest.update_autoreprovision_settings(True, 1)
    if not settings_applied:
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    self._pause_couchbase(self.server_fail)
    self.sleep(5)
    AutoReprovisionBaseTest.wait_for_warmup_or_assert(
        self.master, 1, max_wait, self)

    RemoteUtilHelper.common_basic_setup([self.server_fail])
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0, max_wait, self)

    cluster = RestHelper(self.rest)
    self.assertTrue(cluster.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertTrue(cluster.is_cluster_rebalanced(),
                    "cluster is not balanced")

    for bkt in self.rest.get_buckets():
        expected_items = self.loaded_items[bkt.name]
        self.verify_loaded_data(self.master, bkt.name, expected_items)
def test_node_cb_restart(self):
    """Restart Couchbase on ``self.server_fail`` and check recovery.

    Applies auto-reprovision settings ``(True, 1)``, restarts the service
    through a remote shell, waits on the failover count going to 1 and then
    back to 0, asserts the cluster is healthy but not balanced, rebalances
    and verifies the loaded items of every bucket.
    """
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)

    shell = RemoteMachineShellConnection(self.server_fail)
    shell.restart_couchbase()

    # NOTE(review): the second positional argument looks like the expected
    # number of failed-over nodes (1 while restarting, 0 once recovered)
    # -- confirm against AutoReprovisionBaseTest.
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 0,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    self.sleep(5)

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    # The message is shown when the assertion fails, i.e. when the cluster
    # unexpectedly IS balanced; the old text said the opposite.
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    for bucket in self.rest.get_buckets():
        self.verify_loaded_data(self.master, bucket.name,
                                self.loaded_items[bucket.name])
def test_ui_logs(self):
    """Check the log entries written while auto-reprovision is exercised.

    Applies auto-reprovision settings ``(True, 2)``, stops and restarts
    ``servers[1]`` and ``servers[2]``, resets the auto-reprovision count
    twice, rebalances, and after each step asserts on the text of the five
    most recent entries returned by ``self.rest.get_logs(5)`` as well as on
    the ``enabled`` / ``max_nodes`` / ``count`` settings fields.
    """
    def recent_log_texts():
        # Text of the entries returned by get_logs(5), fetched fresh on
        # every call.
        return [entry['text'] for entry in self.rest.get_logs(5)]

    def assert_settings(expected_count):
        # enabled/max_nodes stay at (True, 2) for the whole test; only the
        # count changes.
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, expected_count)

    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    if not self.rest.update_autoreprovision_settings(True, 2):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    # assertIn reports the searched list on failure, unlike
    # assertTrue(x in ...).
    self.assertIn(u'Enabled auto-reprovision config with max_nodes set to 2',
                  recent_log_texts())

    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    self.assertIn(u'auto-reprovision count reset from 0', recent_log_texts())

    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    assert_settings(0)

    self._start_couchbase(server_fail2)
    self._start_couchbase(server_fail1)
    self.sleep(30)
    assert_settings(2)
    self.assertIn(
        u'auto-reprovision is disabled as maximum number of nodes (2) '
        u'that can be auto-reprovisioned has been reached.',
        recent_log_texts())

    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    assert_settings(0)
    self.assertIn(u'auto-reprovision count reset from 2', recent_log_texts())

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    log_texts = recent_log_texts()
    # https://issues.couchbase.com/browse/MB-24520
    self.assertNotIn(u'Reset auto-failover count', log_texts)
    self.assertIn(u'Rebalance completed successfully.', log_texts)
def test_node_memcached_failure_in_series(self):
    """Hit every server, last to first, with a randomly chosen failure.

    For each server one of ``stop``, ``memcached_failure``, ``restart``,
    ``failover`` (firewall enabled) or ``reboot`` is picked at random.
    After the matching waits the test asserts the cluster is healthy,
    checks whether it is balanced (rebalancing where the original logic
    expects an unbalanced cluster) and verifies the loaded items unless
    data loss is expected (no replicas combined with a restart or reboot).
    """
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    data_lost = False

    for i, server in reversed(list(enumerate(self.servers))):
        operation = random.choice(
            ['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
        # Use the test logger instead of Python-2-only print statements.
        self.log.info("server: {0}".format(server))
        self.log.info("operation: {0}".format(operation))
        shell = RemoteMachineShellConnection(server)

        if i == 0:
            # servers[0] is about to be disturbed, so the checks that go
            # through self.master are pointed at servers[1].
            self.master = self.servers[1]

        if operation == 'stop':
            self._stop_couchbase(server)
        elif operation == 'memcached_failure':
            self._pause_couchbase(server)
        elif operation == 'restart':
            shell.restart_couchbase()
        elif operation == 'failover':
            RemoteUtilHelper.enable_firewall(server)
        elif operation == 'reboot':
            os_type = shell.extract_remote_info().type.lower()
            if os_type == 'windows':
                o, r = shell.execute_command("shutdown -r -f -t 0")
                self.sleep(200)
            elif os_type == 'linux':
                o, r = shell.execute_command("reboot")
            else:
                # Without this branch ``o``/``r`` stay unbound and the
                # logging call below dies with an UnboundLocalError.
                self.fail('do not know how to reboot a node of type %r'
                          % os_type)
            shell.log_command_output(o, r)
            self.sleep(60)
        self.sleep(40)

        if operation == 'memcached_failure':
            AutoReprovisionBaseTest.wait_for_warmup_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation not in ('restart', 'memcached_failure', 'reboot'):
            AutoReprovisionBaseTest.wait_for_failover_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation != 'restart':
            RemoteUtilHelper.common_basic_setup([server])
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 0,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

        helper = RestHelper(RestConnection(self.master))
        self.assertTrue(helper.is_cluster_healthy(),
                        "cluster status is not healthy")
        self.sleep(40)

        if operation in ('memcached_failure', 'failover'):
            self.assertTrue(helper.is_cluster_rebalanced(),
                            "cluster is not balanced")
        elif 'kv' in server.services and self.replicas > 0:
            self.assertFalse(helper.is_cluster_rebalanced(),
                             "cluster is balanced")
            self.rest.rebalance(
                otpNodes=[node.id for node in self.rest.node_statuses()],
                ejectedNodes=[])
            self.assertTrue(self.rest.monitorRebalance())
        else:
            self.assertTrue(helper.is_cluster_rebalanced(),
                            "cluster is not balanced")

        buckets = self.rest.get_buckets()
        # Once set, data_lost stays set, so verification is skipped for
        # all later servers as well.
        if self.replicas == 0 and operation in ('restart', 'reboot'):
            data_lost = True
        # NOTE(review): verification is placed inside the per-server loop;
        # the original indentation was lost, so confirm this nesting.
        if not data_lost:
            for bucket in buckets:
                self.verify_loaded_data(self.master, bucket.name,
                                        self.loaded_items[bucket.name])
def test_ui_logs(self):
    """Check the log entries written while auto-reprovision is exercised.

    Applies auto-reprovision settings ``(True, 2)``, stops and restarts
    ``servers[1]`` and ``servers[2]``, resets the auto-reprovision count
    twice, rebalances, and after each step asserts on the text of the five
    most recent entries returned by ``self.rest.get_logs(5)`` as well as on
    the ``enabled`` / ``max_nodes`` / ``count`` settings fields.
    """
    def recent_log_texts():
        # Text of the entries returned by get_logs(5), fetched fresh on
        # every call.
        return [entry['text'] for entry in self.rest.get_logs(5)]

    def assert_settings(expected_count):
        # enabled/max_nodes stay at (True, 2) for the whole test; only the
        # count changes.
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, expected_count)

    timeout = self.timeout / 2
    server_fail1 = self.servers[1]
    server_fail2 = self.servers[2]
    if not self.rest.update_autoreprovision_settings(True, 2):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    # assertIn reports the searched list on failure, unlike
    # assertTrue(x in ...).
    self.assertIn(u'Enabled auto-reprovision config with max_nodes set to 2',
                  recent_log_texts())

    self.log.info("stopping the first server")
    self._stop_couchbase(server_fail1)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 1,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    self.assertIn(u'auto-reprovision count reset from 0', recent_log_texts())

    self.log.info("stopping the second server")
    self._stop_couchbase(server_fail2)
    AutoReprovisionBaseTest.wait_for_failover_or_assert(
        self.master, 2,
        timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
    assert_settings(0)

    self._start_couchbase(server_fail2)
    self._start_couchbase(server_fail1)
    self.sleep(30)
    assert_settings(2)
    self.assertIn(
        u'auto-reprovision is disabled as maximum number of nodes (2) '
        u'that can be auto-reprovisioned has been reached.',
        recent_log_texts())

    self.log.info("resetting the autoreprovision count")
    if not self.rest.reset_autoreprovision():
        self.fail('failed to reset autoreprovision count!')
    assert_settings(0)
    self.assertIn(u'auto-reprovision count reset from 2', recent_log_texts())

    helper = RestHelper(self.rest)
    self.assertTrue(helper.is_cluster_healthy(),
                    "cluster status is not healthy")
    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
    self.rest.rebalance(
        otpNodes=[node.id for node in self.rest.node_statuses()],
        ejectedNodes=[])
    self.assertTrue(self.rest.monitorRebalance())

    log_texts = recent_log_texts()
    # https://issues.couchbase.com/browse/MB-24520
    self.assertNotIn(u'Reset auto-failover count', log_texts)
    self.assertIn(u'Rebalance completed successfully.', log_texts)
def test_node_memcached_failure_in_series(self):
    """Hit every server, last to first, with a randomly chosen failure.

    For each server one of ``stop``, ``memcached_failure``, ``restart``,
    ``failover`` (firewall enabled) or ``reboot`` is picked at random.
    After the matching waits the test asserts the cluster is healthy,
    checks whether it is balanced (rebalancing where the original logic
    expects an unbalanced cluster) and verifies the loaded items unless
    data loss is expected (no replicas combined with a restart or reboot).
    """
    timeout = self.timeout / 2
    if not self.rest.update_autoreprovision_settings(True, 1):
        self.fail('failed to change autoreprovision_settings!')
    self.sleep(5)
    data_lost = False

    for i, server in reversed(list(enumerate(self.servers))):
        operation = random.choice(
            ['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
        # Use the test logger instead of Python-2-only print statements.
        self.log.info("server: {0}".format(server))
        self.log.info("operation: {0}".format(operation))
        shell = RemoteMachineShellConnection(server)

        if i == 0:
            # servers[0] is about to be disturbed, so the checks that go
            # through self.master are pointed at servers[1].
            self.master = self.servers[1]

        if operation == 'stop':
            self._stop_couchbase(server)
        elif operation == 'memcached_failure':
            self._pause_couchbase(server)
        elif operation == 'restart':
            shell.restart_couchbase()
        elif operation == 'failover':
            RemoteUtilHelper.enable_firewall(server)
        elif operation == 'reboot':
            os_type = shell.extract_remote_info().type.lower()
            if os_type == 'windows':
                o, r = shell.execute_command("shutdown -r -f -t 0")
                self.sleep(200)
            elif os_type == 'linux':
                o, r = shell.execute_command("reboot")
            else:
                # Without this branch ``o``/``r`` stay unbound and the
                # logging call below dies with an UnboundLocalError.
                self.fail('do not know how to reboot a node of type %r'
                          % os_type)
            shell.log_command_output(o, r)
            self.sleep(60)
        self.sleep(40)

        if operation == 'memcached_failure':
            AutoReprovisionBaseTest.wait_for_warmup_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation not in ('restart', 'memcached_failure', 'reboot'):
            AutoReprovisionBaseTest.wait_for_failover_or_assert(
                self.master, 1,
                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
        if operation != 'restart':
            RemoteUtilHelper.common_basic_setup([server])
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 0,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

        helper = RestHelper(RestConnection(self.master))
        self.assertTrue(helper.is_cluster_healthy(),
                        "cluster status is not healthy")
        self.sleep(40)

        if operation in ('memcached_failure', 'failover'):
            self.assertTrue(helper.is_cluster_rebalanced(),
                            "cluster is not balanced")
        elif 'kv' in server.services and self.replicas > 0:
            self.assertFalse(helper.is_cluster_rebalanced(),
                             "cluster is balanced")
            self.rest.rebalance(
                otpNodes=[node.id for node in self.rest.node_statuses()],
                ejectedNodes=[])
            self.assertTrue(self.rest.monitorRebalance())
        else:
            self.assertTrue(helper.is_cluster_rebalanced(),
                            "cluster is not balanced")

        buckets = self.rest.get_buckets()
        # Once set, data_lost stays set, so verification is skipped for
        # all later servers as well.
        if self.replicas == 0 and operation in ('restart', 'reboot'):
            data_lost = True
        # NOTE(review): verification is placed inside the per-server loop;
        # the original indentation was lost, so confirm this nesting.
        if not data_lost:
            for bucket in buckets:
                self.verify_loaded_data(self.master, bucket.name,
                                        self.loaded_items[bucket.name])