def test_master_node_replace() -> None:
    """Replace the master-0 pod and wait for the recovery plan to finish.

    Ideally, the pod will get placed on a different agent; the remaining two
    masters must then find the replaced master at its new IP address, which
    requires a reasonably low TTL for Java DNS lookups.
    """
    replace_command = "pod replace master-0"
    sdk_cmd.svc_cli(package_name, service_name, replace_command)
    # Observe recovery starting before waiting for it to complete.
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_master_node_replace():
    """Replace the master-0 pod of the foldered service and wait for recovery.

    Ideally, the pod will get placed on a different agent; the remaining two
    masters must then find the replaced master at its new IP address, which
    requires a reasonably low TTL for Java DNS lookups.
    """
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, 'pod replace master-0')
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(svc)
    sdk_plan.wait_for_completed_recovery(svc)
def test_master_reelection():
    """Kill the elected master's process and verify a different master takes over."""
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    old_master = config.get_elasticsearch_master(service_name=svc)
    # Kill the Elasticsearch process of the current master on its host.
    master_host = sdk_hosts.system_host(svc, old_master)
    shakedown.kill_process_on_host(master_host, "master__.*Elasticsearch")
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(svc)
    sdk_plan.wait_for_completed_recovery(svc)
    config.wait_for_expected_nodes_to_exist(service_name=svc)
    elected = config.get_elasticsearch_master(service_name=svc)
    # A master pod must hold the role, and it must differ from the one killed.
    assert elected.startswith("master") and elected != old_master
def test_master_reelection():
    """Kill the elected Elasticsearch master's process, verify that a different
    master pod is elected, then wait for the service to fully settle.
    """
    # Fix: define foldered_name locally, consistent with the sibling tests in
    # this file (e.g. test_data_node_replace) which all compute it this way;
    # previously the name was referenced here without a local definition.
    # NOTE(review): if a module-level foldered_name already exists, this
    # shadows it with the same computed value — harmless either way.
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    # Kill the master's Elasticsearch process on the host it runs on.
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, initial_master), "master__.*Elasticsearch")
    # Observe recovery starting before waiting for it to complete.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    # A master pod must hold the role, and it must differ from the one killed.
    assert new_master.startswith("master") and new_master != initial_master
    # Ensure the service is fully settled before the test ends.
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_reelection():
    """Kill the elected Elasticsearch master's task, verify that a different
    master pod is elected, then wait for the service to fully settle.
    """
    # Fix: define foldered_name locally, consistent with sibling tests in this
    # file which compute it via sdk_utils.get_foldered_name; previously the
    # name was referenced here without a local definition.
    # NOTE(review): if a module-level foldered_name already exists, this
    # shadows it with the same computed value — harmless either way.
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    # Kill the master's Elasticsearch process (running as "nobody") on the
    # agent host where the master task was scheduled.
    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(foldered_name, initial_master)[0].host,
    )
    # Observe recovery starting before waiting for it to complete.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    # A master pod must hold the role, and it must differ from the one killed.
    assert new_master.startswith("master") and new_master != initial_master
    # Ensure the service is fully settled before the test ends.
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_reelection() -> None:
    """Kill the elected master's task and verify that a different master is elected."""
    old_master = config.get_elasticsearch_master(service_name=service_name)
    # Locate the agent running the current master, then kill its
    # Elasticsearch process (running as "nobody") there.
    agent = sdk_tasks.get_service_tasks(service_name, old_master)[0].host
    sdk_cmd.kill_task_with_pattern("master__.*Elasticsearch", "nobody", agent_host=agent)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    config.wait_for_expected_nodes_to_exist(service_name=service_name)
    current_master = config.get_elasticsearch_master(service_name=service_name)
    # A master pod must hold the role, and it must differ from the one killed.
    assert current_master.startswith("master") and current_master != old_master
    # Ensure the service is fully settled before the test ends.
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_namenodes_acheive_quorum_after_journalnode_replace():
    """Check that namenodes recover after a journalnode failure.

    Exercises the fix for https://jira.apache.org/jira/browse/HDFS-10659:
    without it, after the first Journal Node recovery, the second Journal
    Node pod replace triggers crash looping of both the replaced Journal
    Node pod and all NameNode pods.
    """
    recovery_timeout_s = 5 * 60
    # journal-0 intentionally appears twice: the sequence exercises a second
    # replace after an earlier recovery has completed.
    for pod_name in ("journal-0", "journal-1", "journal-0"):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod replace {}".format(pod_name))
        # Wait for recovery to start before it completes, to avoid timing issues.
        sdk_plan.wait_for_in_progress_recovery(service_name=foldered_name, timeout_seconds=recovery_timeout_s)
        # wait_for_completed_recovery tracks failed tasks and terminates in
        # case of a crash loop.
        sdk_plan.wait_for_completed_recovery(service_name=foldered_name, timeout_seconds=recovery_timeout_s)
def test_coordinator_node_replace():
    """Replace the coordinator-0 pod and wait for recovery to run to completion."""
    replace_command = 'pod replace coordinator-0'
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, replace_command)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_node_replace():
    """Replace the master-0 pod and wait for the recovery plan to finish.

    Ideally, the pod will get placed on a different agent; the remaining two
    masters must then find the replaced master at its new IP address, which
    requires a reasonably low TTL for Java DNS lookups.
    """
    replace_command = 'pod replace master-0'
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, replace_command)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_data_node_replace():
    """Replace the data-0 pod of the foldered service and wait for recovery."""
    svc = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, svc, 'pod replace data-0')
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(svc)
    sdk_plan.wait_for_completed_recovery(svc)
def test_coordinator_node_replace() -> None:
    """Replace the coordinator-0 pod and wait for recovery to run to completion."""
    replace_command = "pod replace coordinator-0"
    sdk_cmd.svc_cli(package_name, service_name, replace_command)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_data_node_replace():
    """Replace the data-0 pod and wait for the recovery plan to finish."""
    replace_command = "pod replace data-0"
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, replace_command)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec,
    and that tasks, networks, and endpoints of the overlay/host VIP services are
    wired up correctly.
    """
    deployment_plan = sdk_plan.wait_for_completed_deployment(
        config.SERVICE_NAME)
    log.info("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct:
    # five phases, one per pod flavor, each with exactly one step.
    assert (len(deployment_plan['phases']) == 5)
    assert (deployment_plan['phases'][0]['name'] == 'hello-overlay-deploy')
    assert (deployment_plan['phases'][1]['name'] == 'hello-overlay-vip-deploy')
    assert (deployment_plan['phases'][2]['name'] == 'hello-host-vip-deploy')
    assert (deployment_plan['phases'][3]['name'] == 'hello-host-deploy')
    assert (deployment_plan["phases"][4]["name"] == "getter-deploy")
    assert (len(deployment_plan['phases'][0]['steps']) == 1)
    assert (len(deployment_plan["phases"][1]["steps"]) == 1)
    assert (len(deployment_plan["phases"][2]["steps"]) == 1)
    assert (len(deployment_plan["phases"][3]["steps"]) == 1)
    assert (len(deployment_plan["phases"][4]["steps"]) == 1)

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(config.SERVICE_NAME, timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=60)
    except retrying.RetryError:
        # No recovery was observed within the timeout: nothing failed, proceed.
        pass

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = [
        task for task in shakedown.get_service_tasks(config.SERVICE_NAME, completed=False)
    ]
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert (expected_task in framework_task_names), "Missing {expected}".format(
            expected=expected_task)

    # Host-network tasks must reserve port resources; overlay-network tasks
    # must not (their ports live on the virtual network).
    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:
            # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources.keys(
            ), "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources.keys(
            ), "Task {} should NOT have port resources".format(name)

    # Check each task is attached to the expected network (None = host network).
    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server", expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    # Exactly two endpoints should be advertised: overlay-vip and host-vip.
    endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                       'endpoints', json=True)
    assert len(endpoints_result
               ) == 2, "Wrong number of endpoints got {} should be 2".format(
                   len(endpoints_result))

    # Overlay VIP endpoint: single address on the overlay subnet (starts with
    # "9" here — presumably the 9.x overlay range; confirm against cluster config),
    # port 4044, and a matching autoip DNS entry.
    overlay_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                               'endpoints overlay-vip', json=True)
    assert "address" in overlay_endpoints_result.keys(), "overlay endpoints missing 'address'"\
        "{}".format(overlay_endpoints_result)
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-overlay-vip-0-server", 4044)

    # Host VIP endpoint: single address on the host network (starts with "10"),
    # port 4044, and a matching autoip DNS entry.
    host_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                            'endpoints host-vip', json=True)
    assert "address" in host_endpoints_result.keys(), "overlay endpoints missing 'address'"\
        "{}".format(host_endpoints_result)
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-host-vip-0-server", 4044)
def restart_zookeeper_node(id: int):
    """Restart the zookeeper pod with the given index and wait for recovery."""
    restart_command = "pod restart zookeeper-{}".format(id)
    sdk_cmd.svc_cli(ZK_PACKAGE, ZK_SERVICE_NAME, restart_command)
    # Recovery must be seen in progress before we wait on completion.
    sdk_plan.wait_for_in_progress_recovery(ZK_SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(ZK_SERVICE_NAME)
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec,
    and that tasks, networks, and endpoints of the overlay/host VIP services are
    wired up correctly.
    """
    deployment_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct:
    # five phases, one per pod flavor, each with exactly one step.
    assert(len(deployment_plan['phases']) == 5)
    assert(deployment_plan['phases'][0]['name'] == 'hello-overlay-deploy')
    assert(deployment_plan['phases'][1]['name'] == 'hello-overlay-vip-deploy')
    assert(deployment_plan['phases'][2]['name'] == 'hello-host-vip-deploy')
    assert(deployment_plan['phases'][3]['name'] == 'hello-host-deploy')
    assert(deployment_plan["phases"][4]["name"] == "getter-deploy")
    assert(len(deployment_plan['phases'][0]['steps']) == 1)
    assert(len(deployment_plan["phases"][1]["steps"]) == 1)
    assert(len(deployment_plan["phases"][2]["steps"]) == 1)
    assert(len(deployment_plan["phases"][3]["steps"]) == 1)
    assert(len(deployment_plan["phases"][4]["steps"]) == 1)

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(config.SERVICE_NAME, timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=60)
    except TimeoutExpired:
        # No recovery was observed within the timeout: nothing failed, proceed.
        pass

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = [task for task in shakedown.get_service_tasks(config.SERVICE_NAME, completed=False)]
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert(expected_task in framework_task_names), "Missing {expected}".format(expected=expected_task)

    # Host-network tasks must reserve port resources; overlay-network tasks
    # must not (their ports live on the virtual network).
    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:
            # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources.keys(), "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources.keys(), "Task {} should NOT have port resources".format(name)

    # Check each task is attached to the expected network (None = host network).
    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server", expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    # Exactly two endpoints should be advertised: overlay-vip and host-vip.
    endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints', json=True)
    assert len(endpoints_result) == 2, "Wrong number of endpoints got {} should be 2".format(len(endpoints_result))

    # Overlay VIP endpoint: single address on the overlay subnet (starts with
    # "9" here — presumably the 9.x overlay range; confirm against cluster config),
    # port 4044, and a matching autoip DNS entry.
    overlay_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints overlay-vip', json=True)
    assert "address" in overlay_endpoints_result.keys(), "overlay endpoints missing 'address'"\
        "{}".format(overlay_endpoints_result)
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(config.SERVICE_NAME, "hello-overlay-vip-0-server", 4044)

    # Host VIP endpoint: single address on the host network (starts with "10"),
    # port 4044, and a matching autoip DNS entry.
    host_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints host-vip', json=True)
    assert "address" in host_endpoints_result.keys(), "overlay endpoints missing 'address'"\
        "{}".format(host_endpoints_result)
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(config.SERVICE_NAME, "hello-host-vip-0-server", 4044)