def test_rebalance_sanity(params_from_base_test_setup):
    """
    1. Start with a two node couchbase server cluster
    2. Add docs to Sync Gateway
    3. Update the docs in a background thread while rebalancing out one server node
    4. Verify that all docs and revisions are present and appear in the changes feed
    5. Rebalance the server node back into the cluster
    """

    cluster_config = params_from_base_test_setup["cluster_config"]
    mode = params_from_base_test_setup["mode"]

    cluster_helper = ClusterKeywords()

    sg_conf_name = "sync_gateway_default_functional_tests"
    sg_conf_path = sync_gateway_config_path_for_mode(sg_conf_name, mode)

    cluster_helper.reset_cluster(cluster_config=cluster_config,
                                 sync_gateway_config=sg_conf_path)

    topology = cluster_helper.get_cluster_topology(cluster_config)

    admin_sg_one = topology["sync_gateways"][0]["admin"]
    sg_one_url = topology["sync_gateways"][0]["public"]

    cluster_servers = topology["couchbase_servers"]
    cbs_one_url = cluster_servers[0]
    cbs_two_url = cluster_servers[1]

    log_info("Running: 'test_rebalance_sanity'")
    log_info("cluster_config: {}".format(cluster_config))
    log_info("admin_sg: {}".format(admin_sg_one))
    log_info("sg_url: {}".format(sg_one_url))
    log_info("cbs_one_url: {}".format(cbs_one_url))
    log_info("cbs_two_url: {}".format(cbs_two_url))

    sg_db = "db"
    num_docs = 100
    num_updates = 100
    sg_user_name = "seth"
    sg_user_password = "******"
    channels = ["ABC", "CBS"]

    client = MobileRestClient()

    cb_server = CouchbaseServer(cbs_one_url)
    server_to_remove = CouchbaseServer(cbs_two_url)

    client.create_user(admin_sg_one, sg_db, sg_user_name, sg_user_password, channels=channels)
    session = client.create_session(admin_sg_one, sg_db, sg_user_name)

    with concurrent.futures.ThreadPoolExecutor(5) as executor:

        # Add docs to sg
        log_info("Adding docs to sync_gateway")
        docs = client.add_docs(sg_one_url, sg_db, num_docs, "test_doc", channels=channels, auth=session)
        assert len(docs) == num_docs

        # Start updating docs in a background thread
        log_info("Updating docs on sync_gateway")
        update_docs_task = executor.submit(client.update_docs, sg_one_url, sg_db, docs, num_updates, auth=session)

        # Rebalance out one CBS node while the doc updates are running
        cb_server.rebalance_out(cluster_servers, server_to_remove)

        updated_docs = update_docs_task.result()
        log_info(updated_docs)

    # Verify docs / revisions present
    client.verify_docs_present(sg_one_url, sg_db, updated_docs, auth=session)

    # Verify doc revisions in changes feed
    client.verify_docs_in_changes(sg_one_url, sg_db, updated_docs, auth=session)

    # Rebalance the server back into the pool
    cb_server.add_node(server_to_remove)
    cb_server.rebalance_in(cluster_servers, server_to_remove)

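# A minimal, self-contained sketch of the pattern used above: long-running doc
# updates are submitted to a background thread while a blocking cluster
# operation (the rebalance) runs in the foreground. This is illustrative only;
# 'fake_update_docs' and 'fake_rebalance_out' are hypothetical stand-ins, not
# the MobileRestClient / CouchbaseServer helpers used by the test.
def _example_background_updates_during_rebalance():
    import concurrent.futures
    import time

    def fake_update_docs(doc_ids, num_updates):
        # Simulate a long-running series of doc updates
        for _ in range(num_updates):
            time.sleep(0.01)
        return {doc_id: num_updates for doc_id in doc_ids}

    def fake_rebalance_out():
        # Simulate a blocking cluster operation (e.g. a rebalance)
        time.sleep(1)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        update_task = executor.submit(fake_update_docs, ["doc_1", "doc_2"], 10)
        fake_rebalance_out()            # blocks while the updates keep running
        return update_task.result()     # re-raises any exception from the worker
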
def test_server_goes_down_sanity(params_from_base_test_setup):
    """
    1. Start with a two node couchbase server cluster
    2. Start adding docs
    3. Kill one of the server nodes and signal completion
    4. Stop adding docs
    5. Verify that the expected docs are present and in the changes feed
    6. Start the server again and add it back to the cluster
    """

    cluster_config = params_from_base_test_setup["cluster_config"]
    mode = params_from_base_test_setup["mode"]

    cluster_helper = ClusterKeywords()

    sg_conf_name = "sync_gateway_default_functional_tests"
    sg_conf_path = sync_gateway_config_path_for_mode(sg_conf_name, mode)

    cluster_helper.reset_cluster(cluster_config=cluster_config,
                                 sync_gateway_config=sg_conf_path)

    topology = cluster_helper.get_cluster_topology(cluster_config)

    admin_sg = topology["sync_gateways"][0]["admin"]
    sg_url = topology["sync_gateways"][0]["public"]

    couchbase_servers = topology["couchbase_servers"]
    cbs_one_url = couchbase_servers[0]
    cbs_two_url = couchbase_servers[1]

    log_info("Running: 'test_server_goes_down_sanity'")
    log_info("cluster_config: {}".format(cluster_config))
    log_info("admin_sg: {}".format(admin_sg))
    log_info("sg_url: {}".format(sg_url))
    log_info("cbs_one_url: {}".format(cbs_one_url))
    log_info("cbs_two_url: {}".format(cbs_two_url))

    sg_db = "db"
    num_docs = 100

    sg_user_name = "seth"
    sg_user_password = "******"
    channels = ["ABC", "CBS"]

    client = MobileRestClient()

    main_server = CouchbaseServer(cbs_one_url)
    flakey_server = CouchbaseServer(cbs_two_url)

    client.create_user(admin_sg, sg_db, sg_user_name, sg_user_password, channels=channels)
    session = client.create_session(admin_sg, sg_db, sg_user_name)

    # Stop second server
    flakey_server.stop()

    # Try to add 100 docs in a loop until all succeed. If they never do, fail with a timeout.
    errors = num_docs

    # Wait 30 seconds for auto failover
    # (Minimum value suggested - http://docs.couchbase.com/admin/admin/Tasks/tasks-nodeFailover.html)
    # + 15 seconds to add docs
    timeout = 45
    start = time.time()

    successful_add = False
    while not successful_add:

        # Fail the test if all docs do not succeed before the timeout
        if (time.time() - start) > timeout:
            # Bring the server back up before failing the test
            flakey_server.start()
            main_server.rebalance_in(couchbase_servers, flakey_server)
            raise TimeoutError("Failed to successfully put docs before timeout")

        try:
            docs = client.add_docs(url=sg_url, db=sg_db, number=num_docs, id_prefix=None, auth=session, channels=channels)

            # If the add above does not throw, it was a successful add
            successful_add = True
        except requests.exceptions.HTTPError as he:
            log_info("Failed to add docs: {}".format(he))
            log_info("Seeing: {} errors".format(errors))
            time.sleep(1)

    assert len(docs) == num_docs
    client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=docs, auth=session)

    try:
        client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=docs, auth=session, polling_interval=5)
    except keywords.exceptions.TimeoutException:
        # Timed out verifying docs. Bring the server back in to restore the topology, then fail.
        # Failing due to https://github.com/couchbase/sync_gateway/issues/2197
        flakey_server.start()
        main_server.recover(flakey_server)
        main_server.rebalance_in(couchbase_servers, flakey_server)
        raise keywords.exceptions.TimeoutException("Failed to get all changes")

    # Test succeeded without timeout, bring the server back into the topology
    flakey_server.start()
    main_server.recover(flakey_server)
    main_server.rebalance_in(couchbase_servers, flakey_server)

    log_info("test_server_goes_down_sanity complete!")

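# The add-docs loop above follows a common retry-until-timeout shape: keep
# retrying an operation that raises on failure, and give up once a deadline
# passes. A generic sketch of that shape, assuming only the standard library,
# is shown below; it is not a helper that exists in this repo.
def _example_retry_until_timeout(operation, timeout_sec, retry_interval_sec=1):
    import time

    start = time.time()
    while True:
        # Give up once the deadline passes
        if (time.time() - start) > timeout_sec:
            raise RuntimeError("Operation did not succeed within {}s".format(timeout_sec))
        try:
            return operation()
        except Exception as e:
            # Log and retry; the timeout above bounds how long this can loop
            print("Operation failed ({}), retrying ...".format(e))
            time.sleep(retry_interval_sec)
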
def test_server_goes_down_rebuild_channels(params_from_base_test_setup):
    """
    1. Start with a two node couchbase server cluster
    2. Create an 'admin' user and a 'seth' user, both with access to channel "ABC"
    3. Push docs as the admin user and verify that seth sees them in his changes feed
    4. Kill one of the server nodes
    5. Poll the changes feed until failover completes and the channels are rebuilt
    6. Verify that no new changes appear and that all original changes are intact
    7. Start the server again and add it back to the cluster
    """

    cluster_config = params_from_base_test_setup["cluster_config"]
    mode = params_from_base_test_setup["mode"]

    cluster_helper = ClusterKeywords()

    sg_conf_name = "sync_gateway_default_functional_tests"
    sg_conf_path = sync_gateway_config_path_for_mode(sg_conf_name, mode)

    cluster_helper.reset_cluster(cluster_config=cluster_config,
                                 sync_gateway_config=sg_conf_path)

    topology = cluster_helper.get_cluster_topology(cluster_config)

    admin_sg = topology["sync_gateways"][0]["admin"]
    sg_url = topology["sync_gateways"][0]["public"]

    couchbase_servers = topology["couchbase_servers"]
    cbs_one_url = couchbase_servers[0]
    cbs_two_url = couchbase_servers[1]

    log_info("Running: 'test_server_goes_down_rebuild_channels'")
    log_info("cluster_config: {}".format(cluster_config))
    log_info("admin_sg: {}".format(admin_sg))
    log_info("sg_url: {}".format(sg_url))
    log_info("cbs_one_url: {}".format(cbs_one_url))
    log_info("cbs_two_url: {}".format(cbs_two_url))

    sg_db = "db"
    num_docs = 100

    admin_user_info = userinfo.UserInfo(
        name="admin",
        password="******",
        channels=["ABC"],
        roles=[]
    )

    seth_user_info = userinfo.UserInfo(
        name="seth",
        password="******",
        channels=["ABC"],
        roles=[]
    )

    client = MobileRestClient()

    main_server = CouchbaseServer(cbs_one_url)
    flakey_server = CouchbaseServer(cbs_two_url)

    admin_auth = client.create_user(
        admin_sg,
        sg_db,
        admin_user_info.name,
        admin_user_info.password,
        channels=admin_user_info.channels
    )

    client.create_user(
        admin_sg,
        sg_db,
        seth_user_info.name,
        seth_user_info.password,
        channels=seth_user_info.channels
    )

    seth_session = client.create_session(admin_sg, sg_db, seth_user_info.name)

    # Allow any user docs to make it to the changes feed before capturing the initial last_seq
    initial_changes = client.get_changes(url=sg_url, db=sg_db, since=0, auth=seth_session)

    # Push docs from the admin user
    docs = client.add_docs(
        url=sg_url,
        db=sg_db,
        number=num_docs,
        id_prefix=None,
        channels=admin_user_info.channels,
        auth=admin_auth
    )
    assert len(docs) == num_docs

    client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=docs, auth=seth_session)

    changes_before_failover = client.get_changes(url=sg_url, db=sg_db, since=initial_changes["last_seq"], auth=seth_session)
    assert len(changes_before_failover["results"]) == num_docs

    # Stop the server via 'service stop'
    flakey_server.stop()

    start = time.time()
    while True:

        # Fail the test if the changes feed is not rebuilt before the timeout
        if (time.time() - start) > 60:
            # Bring the server back up before failing the test
            flakey_server.start()
            main_server.recover(flakey_server)
            main_server.rebalance_in(couchbase_servers, flakey_server)
            raise keywords.exceptions.TimeoutError("Failed to rebuild changes")

        try:
            # Poll until failover happens (~30 seconds)
            client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=docs, auth=seth_session)
            # The changes request succeeded, exit the loop
            break
        except requests.exceptions.HTTPError:
            # Changes requests will fail until failover of the down server happens. Wait and try again.
            log_info("/db/_changes failed due to server down. Retrying ...")
            time.sleep(1)

    # Verify no new changes
    changes = client.get_changes(
        url=sg_url,
        db=sg_db,
        since=changes_before_failover["last_seq"],
        auth=seth_session,
        feed="normal"
    )
    assert len(changes["results"]) == 0

    # Check that all changes are intact from the initial changes request
    changes = client.get_changes(url=sg_url, db=sg_db, since=initial_changes["last_seq"], auth=seth_session)
    assert len(changes["results"]) == num_docs

    # Test succeeded without timeout, bring the server back into the topology
    flakey_server.start()
    main_server.recover(flakey_server)
    main_server.rebalance_in(couchbase_servers, flakey_server)

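# For reference, the 'no new changes' check above boils down to a one-shot
# (feed=normal) _changes request against Sync Gateway's public REST API with
# 'since' set to the last sequence seen before failover. A rough sketch using
# 'requests' directly is below; the URL, database name, and session id are
# placeholders, and the cookie-based auth shown here is an assumption rather
# than a description of what MobileRestClient does internally.
def _example_one_shot_changes(sg_public_url="http://localhost:4984", db="db",
                              since="0", session_id="<session-id>"):
    import requests

    resp = requests.get(
        "{}/{}/_changes".format(sg_public_url, db),
        params={"since": since, "feed": "normal"},
        cookies={"SyncGatewaySession": session_id}  # Sync Gateway session cookie
    )
    resp.raise_for_status()
    changes = resp.json()
    # 'results' holds the changes rows; 'last_seq' can seed the next request
    return len(changes["results"]), changes["last_seq"]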