def test_missing_num_shards(params_from_base_test_setup, sg_conf):
    """
    1. Launch sg_accels with the "num_shards": 16 property missing from the config
    2. Verify there are 16 shards
    3. Verify they are distributed evenly across the nodes
    """
    cluster_conf = params_from_base_test_setup["cluster_config"]

    log_info("Running 'test_missing_num_shards'")
    log_info("cluster_conf: {}".format(cluster_conf))
    log_info("sg_conf: {}".format(sg_conf))

    cluster = Cluster(config=cluster_conf)
    cluster.reset(sg_config_path=sg_conf)

    # Query the CBGT REST Admin API on one of the accel nodes
    accel_admin = Admin(cluster.sg_accels[1])
    cbgt_config = accel_admin.get_cbgt_config()

    # The default number of pindex shards is 16 when "num_shards" is omitted.
    # This may change in the future, in which case this test will need updating.
    expected_default_shards = 16
    assert cbgt_config.num_shards == expected_default_shards

    # Confirm the shards are distributed evenly across the running accel nodes
    assert cluster.validate_cbgt_pindex_distribution_retry(num_running_sg_accels=3)
def test_take_all_sgaccels_down(params_from_base_test_setup, sg_conf):
    """
    Scenario that takes all sync_gateway accel nodes offline during doc load.
    After bringing the nodes back online during load, the reshard of the DCP feed is verified.
    The changes feed is verified that all docs show up.

    1. Start doc load (1000 doc)
    2. Take all sg_accel nodes down in parallel
    3. Verify node are down
    4. Wait for doc adds to complete, store "doc_push_result_1"
    5. Verify "doc_push_result_1" docs added
    6. Start doc load (1000 docs)
    7. Wait for 5. to complete, store "doc_push_result_2"
    8. Verify "doc_push_result_2" docs added
    9. Start another doc load (1000 docs)
    10. Bring up nodes in parallel
    11. poll on p-index reshard
    12. Wait for 9. to complete, store "doc_push_result_3"
    13. Verify "doc_push_result_3" docs added
    14. Verify "doc_push_result_1" + "doc_push_result_2" + "doc_push_result_3" show up in _changes feed
    """
    cluster_conf = params_from_base_test_setup["cluster_config"]

    # Fix: log the actual test name (previously logged
    # 'test_dcp_reshard_single_sg_accel_goes_down_and_up', a different test)
    log_info("Running 'test_take_all_sgaccels_down'")
    log_info("cluster_conf: {}".format(cluster_conf))
    log_info("sg_conf: {}".format(sg_conf))

    cluster = Cluster(config=cluster_conf)
    cluster.reset(sg_config_path=sg_conf)

    cluster_util = ClusterKeywords()
    topology = cluster_util.get_cluster_topology(cluster_conf)

    sg_url = topology["sync_gateways"][0]["public"]
    sg_admin_url = topology["sync_gateways"][0]["admin"]
    sg_db = "db"
    num_docs = 1000

    client = MobileRestClient()

    doc_pusher_user_info = userinfo.UserInfo("doc_pusher", "pass", channels=["A"], roles=[])
    doc_pusher_auth = client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=doc_pusher_user_info.name,
        password=doc_pusher_user_info.password,
        channels=doc_pusher_user_info.channels
    )

    a_user_info = userinfo.UserInfo("a_user", "pass", channels=["A"], roles=[])
    client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=a_user_info.name,
        password=a_user_info.password,
        channels=a_user_info.channels
    )
    a_user_session = client.create_session(
        url=sg_admin_url,
        db=sg_db,
        name=a_user_info.name,
        password=a_user_info.password
    )

    # Shutdown all accel nodes in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as ex:

        # Start adding docs
        docs_1 = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_1_task = ex.submit(client.add_bulk_docs, url=sg_url, db=sg_db, docs=docs_1, auth=doc_pusher_auth)

        # Take down all access nodes
        log_info("Shutting down sg_accels: [{}, {}, {}] ...".format(
            cluster.sg_accels[0],
            cluster.sg_accels[1],
            cluster.sg_accels[2]
        ))
        sg_accel_down_task_1 = ex.submit(cluster.sg_accels[0].stop)
        sg_accel_down_task_2 = ex.submit(cluster.sg_accels[1].stop)
        sg_accel_down_task_3 = ex.submit(cluster.sg_accels[2].stop)
        assert sg_accel_down_task_1.result() == 0
        assert sg_accel_down_task_2.result() == 0
        assert sg_accel_down_task_3.result() == 0

        # Block until the first bulk_docs is complete
        doc_push_result_1 = docs_1_task.result()
        assert len(doc_push_result_1) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=doc_push_result_1, auth=doc_pusher_auth)

        # Load sync_gateway with another batch of docs while the sg_accel nodes are offline
        docs_2_bodies = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_push_result_2 = client.add_bulk_docs(url=sg_url, db=sg_db, docs=docs_2_bodies, auth=doc_pusher_auth)
        assert len(docs_push_result_2) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=docs_push_result_2, auth=doc_pusher_auth)

        # Start loading Sync Gateway with another set of docs while bringing the sg_accel nodes online
        docs_3 = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_3_task = ex.submit(client.add_bulk_docs, url=sg_url, db=sg_db, docs=docs_3, auth=doc_pusher_auth)

        # Bring all the sg_accel nodes back up
        # (fixed comment: this section starts the nodes, it does not take them down)
        log_info("Starting sg_accels: [{}, {}, {}] ...".format(
            cluster.sg_accels[0],
            cluster.sg_accels[1],
            cluster.sg_accels[2]
        ))
        sg_accel_up_task_1 = ex.submit(cluster.sg_accels[0].start, sg_conf)
        sg_accel_up_task_2 = ex.submit(cluster.sg_accels[1].start, sg_conf)
        sg_accel_up_task_3 = ex.submit(cluster.sg_accels[2].start, sg_conf)
        assert sg_accel_up_task_1.result() == 0
        assert sg_accel_up_task_2.result() == 0
        assert sg_accel_up_task_3.result() == 0

        # Wait for pindex to reshard correctly
        assert cluster.validate_cbgt_pindex_distribution_retry(3)

        # Block until the third bulk_docs is complete
        doc_push_result_3 = docs_3_task.result()
        assert len(doc_push_result_3) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=doc_push_result_3, auth=doc_pusher_auth)

        # Combine the 3 push results and make sure the changes propagate to a_user
        # a_user has access to the doc's channel.
        log_info("Verifying all the changes show up for 'a_user' ...")
        all_docs = doc_push_result_1 + docs_push_result_2 + doc_push_result_3
        client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=all_docs,
                                      auth=a_user_session, polling_interval=2)
# NOTE(review): this is an exact duplicate definition of test_take_all_sgaccels_down
# and shadows the earlier one of the same name — only this copy is collected by
# pytest. Deduplicate by deleting one of the two copies.
def test_take_all_sgaccels_down(params_from_base_test_setup, sg_conf):
    """
    Scenario that takes all sync_gateway accel nodes offline during doc load.
    After bringing the nodes back online during load, the reshard of the DCP feed is verified.
    The changes feed is verified that all docs show up.

    1. Start doc load (1000 doc)
    2. Take all sg_accel nodes down in parallel
    3. Verify node are down
    4. Wait for doc adds to complete, store "doc_push_result_1"
    5. Verify "doc_push_result_1" docs added
    6. Start doc load (1000 docs)
    7. Wait for 5. to complete, store "doc_push_result_2"
    8. Verify "doc_push_result_2" docs added
    9. Start another doc load (1000 docs)
    10. Bring up nodes in parallel
    11. poll on p-index reshard
    12. Wait for 9. to complete, store "doc_push_result_3"
    13. Verify "doc_push_result_3" docs added
    14. Verify "doc_push_result_1" + "doc_push_result_2" + "doc_push_result_3" show up in _changes feed
    """
    cluster_conf = params_from_base_test_setup["cluster_config"]

    # Fix: log the actual test name (previously logged
    # 'test_dcp_reshard_single_sg_accel_goes_down_and_up', a different test)
    log_info("Running 'test_take_all_sgaccels_down'")
    log_info("cluster_conf: {}".format(cluster_conf))
    log_info("sg_conf: {}".format(sg_conf))

    cluster = Cluster(config=cluster_conf)
    cluster.reset(sg_config_path=sg_conf)

    cluster_util = ClusterKeywords()
    topology = cluster_util.get_cluster_topology(cluster_conf)

    sg_url = topology["sync_gateways"][0]["public"]
    sg_admin_url = topology["sync_gateways"][0]["admin"]
    sg_db = "db"
    num_docs = 1000

    client = MobileRestClient()

    doc_pusher_user_info = userinfo.UserInfo("doc_pusher", "pass", channels=["A"], roles=[])
    doc_pusher_auth = client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=doc_pusher_user_info.name,
        password=doc_pusher_user_info.password,
        channels=doc_pusher_user_info.channels
    )

    a_user_info = userinfo.UserInfo("a_user", "pass", channels=["A"], roles=[])
    client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=a_user_info.name,
        password=a_user_info.password,
        channels=a_user_info.channels
    )
    a_user_session = client.create_session(
        url=sg_admin_url,
        db=sg_db,
        name=a_user_info.name,
        password=a_user_info.password
    )

    # Shutdown all accel nodes in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as ex:

        # Start adding docs
        docs_1 = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_1_task = ex.submit(client.add_bulk_docs, url=sg_url, db=sg_db, docs=docs_1, auth=doc_pusher_auth)

        # Take down all access nodes
        log_info("Shutting down sg_accels: [{}, {}, {}] ...".format(
            cluster.sg_accels[0],
            cluster.sg_accels[1],
            cluster.sg_accels[2]
        ))
        sg_accel_down_task_1 = ex.submit(cluster.sg_accels[0].stop)
        sg_accel_down_task_2 = ex.submit(cluster.sg_accels[1].stop)
        sg_accel_down_task_3 = ex.submit(cluster.sg_accels[2].stop)
        assert sg_accel_down_task_1.result() == 0
        assert sg_accel_down_task_2.result() == 0
        assert sg_accel_down_task_3.result() == 0

        # Block until the first bulk_docs is complete
        doc_push_result_1 = docs_1_task.result()
        assert len(doc_push_result_1) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=doc_push_result_1, auth=doc_pusher_auth)

        # Load sync_gateway with another batch of docs while the sg_accel nodes are offline
        docs_2_bodies = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_push_result_2 = client.add_bulk_docs(url=sg_url, db=sg_db, docs=docs_2_bodies, auth=doc_pusher_auth)
        assert len(docs_push_result_2) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=docs_push_result_2, auth=doc_pusher_auth)

        # Start loading Sync Gateway with another set of docs while bringing the sg_accel nodes online
        docs_3 = document.create_docs(None, num_docs, channels=doc_pusher_user_info.channels)
        docs_3_task = ex.submit(client.add_bulk_docs, url=sg_url, db=sg_db, docs=docs_3, auth=doc_pusher_auth)

        # Bring all the sg_accel nodes back up
        # (fixed comment: this section starts the nodes, it does not take them down)
        log_info("Starting sg_accels: [{}, {}, {}] ...".format(
            cluster.sg_accels[0],
            cluster.sg_accels[1],
            cluster.sg_accels[2]
        ))
        sg_accel_up_task_1 = ex.submit(cluster.sg_accels[0].start, sg_conf)
        sg_accel_up_task_2 = ex.submit(cluster.sg_accels[1].start, sg_conf)
        sg_accel_up_task_3 = ex.submit(cluster.sg_accels[2].start, sg_conf)
        assert sg_accel_up_task_1.result() == 0
        assert sg_accel_up_task_2.result() == 0
        assert sg_accel_up_task_3.result() == 0

        # Wait for pindex to reshard correctly
        assert cluster.validate_cbgt_pindex_distribution_retry(3)

        # Block until the third bulk_docs is complete
        doc_push_result_3 = docs_3_task.result()
        assert len(doc_push_result_3) == num_docs
        client.verify_docs_present(url=sg_url, db=sg_db, expected_docs=doc_push_result_3, auth=doc_pusher_auth)

        # Combine the 3 push results and make sure the changes propagate to a_user
        # a_user has access to the doc's channel.
        log_info("Verifying all the changes show up for 'a_user' ...")
        all_docs = doc_push_result_1 + docs_push_result_2 + doc_push_result_3
        client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=all_docs,
                                      auth=a_user_session, polling_interval=2)
def test_take_down_bring_up_sg_accel_validate_cbgt(params_from_base_test_setup, sg_conf):
    """
    Scenario 1

    Start with 3 sg_accels
    Take down 2 sg_accels (block until down -- poll port if needed)
    Doc adds with uuids (~30 sec for cbgt to reshard)
    polling loop: wait for all docs to come back over changes feed
    Call validate pindex with correct number of accels

    Scenario 2 (Continuation)

    When bringing up, you'd have to poll the cbgt_cfg until you get expected number of nodes,
    then you could validate the pindex with 2 accels
    """
    cluster_conf = params_from_base_test_setup["cluster_config"]

    # Fix: log the actual test name (previously logged
    # 'test_dcp_reshard_single_sg_accel_goes_down_and_up', a different test)
    log_info("Running 'test_take_down_bring_up_sg_accel_validate_cbgt'")
    log_info("cluster_conf: {}".format(cluster_conf))
    log_info("sg_conf: {}".format(sg_conf))

    cluster = Cluster(config=cluster_conf)
    cluster.reset(sg_config_path=sg_conf)

    cluster_util = ClusterKeywords()
    topology = cluster_util.get_cluster_topology(cluster_conf)

    sg_url = topology["sync_gateways"][0]["public"]
    sg_admin_url = topology["sync_gateways"][0]["admin"]
    sg_db = "db"

    client = MobileRestClient()

    doc_pusher_user_info = userinfo.UserInfo("doc_pusher", "pass", channels=["A"], roles=[])
    doc_pusher_auth = client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=doc_pusher_user_info.name,
        password=doc_pusher_user_info.password,
        channels=doc_pusher_user_info.channels
    )

    log_info("Shutting down sg_accels: [{}, {}]".format(
        cluster.sg_accels[1], cluster.sg_accels[2]
    ))

    # Shutdown two accel nodes in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as ex:
        sg_accel_down_task_1 = ex.submit(cluster.sg_accels[1].stop)
        sg_accel_down_task_2 = ex.submit(cluster.sg_accels[2].stop)
        assert sg_accel_down_task_1.result() == 0
        assert sg_accel_down_task_2.result() == 0
        log_info("Finished taking nodes down!")

    # It should take some time (~30 s) for cbgt to pick up the failing nodes and reshard
    # the pindexes. During this, add 1000 docs and start a longpoll changes loop to see
    # if those docs make it to the changes feed. If the reshard is successful they will
    # show up at some point after. If not, the docs will fail to show up.
    doc_pusher_docs = client.add_docs(
        url=sg_url,
        db=sg_db,
        number=1000,
        id_prefix=None,
        auth=doc_pusher_auth,
        channels=doc_pusher_user_info.channels
    )
    assert len(doc_pusher_docs) == 1000
    client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=doc_pusher_docs,
                                  auth=doc_pusher_auth, polling_interval=5)

    # The pindexes should be resharded at this point since all of the changes have shown up
    assert cluster.validate_cbgt_pindex_distribution(num_running_sg_accels=1)

    log_info("Start sg_accels: [{}, {}]".format(cluster.sg_accels[1], cluster.sg_accels[2]))

    # Bring the two accel nodes back one at a time, validating the reshard after each
    # (fixed comment: the original said "in parallel", but the starts are sequential)
    status = cluster.sg_accels[1].start(sg_conf)
    assert status == 0

    # Poll on pIndex reshard after bringing the first accel node back
    assert cluster.validate_cbgt_pindex_distribution_retry(num_running_sg_accels=2)

    status = cluster.sg_accels[2].start(sg_conf)
    assert status == 0

    # Poll on pIndex reshard after bringing the second accel node back
    assert cluster.validate_cbgt_pindex_distribution_retry(num_running_sg_accels=3)
# NOTE(review): this is an exact duplicate definition of
# test_take_down_bring_up_sg_accel_validate_cbgt and shadows the earlier one of
# the same name — only this copy is collected by pytest. Deduplicate by deleting
# one of the two copies.
def test_take_down_bring_up_sg_accel_validate_cbgt(params_from_base_test_setup, sg_conf):
    """
    Scenario 1

    Start with 3 sg_accels
    Take down 2 sg_accels (block until down -- poll port if needed)
    Doc adds with uuids (~30 sec for cbgt to reshard)
    polling loop: wait for all docs to come back over changes feed
    Call validate pindex with correct number of accels

    Scenario 2 (Continuation)

    When bringing up, you'd have to poll the cbgt_cfg until you get expected number of nodes,
    then you could validate the pindex with 2 accels
    """
    cluster_conf = params_from_base_test_setup["cluster_config"]

    # Fix: log the actual test name (previously logged
    # 'test_dcp_reshard_single_sg_accel_goes_down_and_up', a different test)
    log_info("Running 'test_take_down_bring_up_sg_accel_validate_cbgt'")
    log_info("cluster_conf: {}".format(cluster_conf))
    log_info("sg_conf: {}".format(sg_conf))

    cluster = Cluster(config=cluster_conf)
    cluster.reset(sg_config_path=sg_conf)

    cluster_util = ClusterKeywords()
    topology = cluster_util.get_cluster_topology(cluster_conf)

    sg_url = topology["sync_gateways"][0]["public"]
    sg_admin_url = topology["sync_gateways"][0]["admin"]
    sg_db = "db"

    client = MobileRestClient()

    doc_pusher_user_info = userinfo.UserInfo("doc_pusher", "pass", channels=["A"], roles=[])
    doc_pusher_auth = client.create_user(
        url=sg_admin_url,
        db=sg_db,
        name=doc_pusher_user_info.name,
        password=doc_pusher_user_info.password,
        channels=doc_pusher_user_info.channels
    )

    log_info("Shutting down sg_accels: [{}, {}]".format(
        cluster.sg_accels[1], cluster.sg_accels[2]
    ))

    # Shutdown two accel nodes in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as ex:
        sg_accel_down_task_1 = ex.submit(cluster.sg_accels[1].stop)
        sg_accel_down_task_2 = ex.submit(cluster.sg_accels[2].stop)
        assert sg_accel_down_task_1.result() == 0
        assert sg_accel_down_task_2.result() == 0
        log_info("Finished taking nodes down!")

    # It should take some time (~30 s) for cbgt to pick up the failing nodes and reshard
    # the pindexes. During this, add 1000 docs and start a longpoll changes loop to see
    # if those docs make it to the changes feed. If the reshard is successful they will
    # show up at some point after. If not, the docs will fail to show up.
    doc_pusher_docs = client.add_docs(
        url=sg_url,
        db=sg_db,
        number=1000,
        id_prefix=None,
        auth=doc_pusher_auth,
        channels=doc_pusher_user_info.channels
    )
    assert len(doc_pusher_docs) == 1000
    client.verify_docs_in_changes(url=sg_url, db=sg_db, expected_docs=doc_pusher_docs,
                                  auth=doc_pusher_auth, polling_interval=5)

    # The pindexes should be resharded at this point since all of the changes have shown up
    assert cluster.validate_cbgt_pindex_distribution(num_running_sg_accels=1)

    log_info("Start sg_accels: [{}, {}]".format(cluster.sg_accels[1], cluster.sg_accels[2]))

    # Bring the two accel nodes back one at a time, validating the reshard after each
    # (fixed comment: the original said "in parallel", but the starts are sequential)
    status = cluster.sg_accels[1].start(sg_conf)
    assert status == 0

    # Poll on pIndex reshard after bringing the first accel node back
    assert cluster.validate_cbgt_pindex_distribution_retry(num_running_sg_accels=2)

    status = cluster.sg_accels[2].start(sg_conf)
    assert status == 0

    # Poll on pIndex reshard after bringing the second accel node back
    assert cluster.validate_cbgt_pindex_distribution_retry(num_running_sg_accels=3)