def test_config_update_while_partitioned():
    """Update the service config while a world agent is partitioned, then verify the rollout.

    The agent hosting the first "world" task is partitioned, WORLD_CPUS is
    raised by 0.1 through Marathon without waiting for the deployment, and the
    agent is reconnected. Afterwards all original world task ids must have
    been updated and every running world task must report the new cpu value.
    """
    world_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "world")
    partition_host = world_tasks[0].host

    sdk_agents.partition_agent(partition_host)
    try:
        service_config = sdk_marathon.get_config(config.SERVICE_NAME)
        updated_cpus = float(service_config["env"]["WORLD_CPUS"]) + 0.1
        service_config["env"]["WORLD_CPUS"] = str(updated_cpus)
        sdk_marathon.update_app(service_config, wait_for_completed_deployment=False)
    finally:
        # Always undo the partition, even when the config update fails, so a
        # failure here does not leave the agent cut off for whatever runs next.
        sdk_agents.reconnect_agent(partition_host)

    # check that ALL the world tasks are updated after the agent reconnects:
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world", [t.id for t in world_tasks])
    check_healthy()

    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)
    running_tasks = [
        t for t in all_tasks if t.name.startswith("world") and t.state == "TASK_RUNNING"
    ]
    assert len(running_tasks) == config.world_task_count(config.SERVICE_NAME)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
def get_scheduler_host(service_name):
    """Return the host of the Marathon task running the scheduler of *service_name*.

    The Marathon tasks are filtered by the prefix returned from
    ``get_scheduler_task_prefix(service_name)``; the host of the last matching
    task is returned.

    Raises:
        Exception: if no Marathon task matches the prefix. The message lists
            the names of all Marathon tasks to help diagnose the mismatch.
    """
    task_prefix = get_scheduler_task_prefix(service_name)
    tasks = sdk_tasks.get_service_tasks("marathon", task_prefix=task_prefix)
    if not tasks:
        # Task entries are accessed through attributes (see `.host` below), so
        # the name is read as `task.name`; subscripting the task here would
        # fail and hide the real error.
        available_names = [task.name for task in sdk_tasks.get_service_tasks("marathon")]
        raise Exception(
            "No marathon tasks starting with '{}' were found. Available tasks are: {}".format(
                task_prefix, available_names
            )
        )
    return tasks.pop().host
def test_scheduler_task_placement_by_marathon():
    """Check that Marathon places the scheduler task on the agent named in a hostname constraint."""
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    try:
        # Pin the scheduler to one private agent, then confirm it landed there.
        target_host = sdk_agents.get_private_agents().pop()["hostname"]
        logging.info("Constraining scheduler placement to [{}]".format(target_host))

        service_options = {
            "constraints": [["hostname", "CLUSTER", "{}".format(target_host)]],
            "yaml": "simple",
        }
        sdk_install.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            expected_running_tasks=1,
            additional_options={"service": service_options},
            wait_for_deployment=False,
        )

        scheduler_tasks = sdk_tasks.get_service_tasks("marathon", config.SERVICE_NAME)
        assert len(scheduler_tasks) == 1, "More than 1 task matched name [{}] : [{}]".format(
            config.SERVICE_NAME, scheduler_tasks
        )

        actual_host = scheduler_tasks.pop().host
        assert target_host == actual_host, "Scheduler task constraint placement failed by marathon"
    finally:
        # Leave the cluster clean regardless of the outcome.
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
def test_unauthorized_users() -> None:
    """Verify that cqlsh rejects a login as dcossuperuser with a wrong password.

    Runs cqlsh inside the first "node-0" task via `task exec` and checks the
    error text on stderr.
    """
    node_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "node-0")[0]
    _, _, stderr = sdk_cmd.run_cli(
        "task exec {} bash -c 'export JAVA_HOME=$(ls -d $MESOS_SANDBOX/jdk*/) ; export PATH=$MESOS_SANDBOX/python-dist/bin:$PATH ; export PATH=$(ls -d $MESOS_SANDBOX/apache-cassandra-*/bin):$PATH ; cqlsh -u dcossuperuser -p wrongpassword -e \"SHOW VERSION\" node-0-server.$FRAMEWORK_HOST $CASSANDRA_NATIVE_TRANSPORT_PORT' ".format(
            node_task.id
        )
    )
    # Without this check the test would pass no matter what cqlsh reported.
    assert "Provided username dcossuperuser and/or password are incorrect" in stderr
def get_hello_world_agent_sets():
    """Collect the agent ids that host the service's hello and world tasks.

    Returns:
        A ``(hello_agents, world_agents)`` tuple of lists. Each list holds one
        agent id per task whose name starts with "hello-" / "world-", in the
        order the tasks are returned; duplicates are kept.

    Raises:
        AssertionError: if a task name starts with neither prefix.
    """
    hello_agents = []
    world_agents = []
    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        if task.name.startswith("hello-"):
            hello_agents.append(task.agent_id)
        elif task.name.startswith("world-"):
            world_agents.append(task.agent_id)
        else:
            # Raise explicitly: `assert False` is stripped under `python -O`,
            # which would let unknown tasks slip through unnoticed.
            raise AssertionError("Unknown task: " + task.name)
    return hello_agents, world_agents
def test_kill_data_node():
    """Kill the datanode process of data-0 and check that only the data task is relaunched."""
    victim = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]

    # Snapshot the task ids of the pods that must stay untouched.
    untouched_ids = {
        pod_type: sdk_tasks.get_task_ids(foldered_name, pod_type)
        for pod_type in ("journal", "name")
    }

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=victim.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [victim.id])
    for pod_type, task_ids in untouched_ids.items():
        sdk_tasks.check_tasks_not_updated(foldered_name, pod_type, task_ids)
def test_kill_world_executor():
    """Kill the `mesos-default-executor` process on world-0's host and expect world-0 to be relaunched."""
    service = config.SERVICE_NAME
    victim = sdk_tasks.get_service_tasks(service, task_prefix="world-0")[0]

    sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=victim.host)

    # The task id must change, and the service must end up healthy again.
    sdk_tasks.check_tasks_updated(service, "world-0", [victim.id])
    check_healthy()
def test_kill_hello_task():
    """Kill the hello-0 task process (matched by its output path) and expect hello-0 to be relaunched."""
    service = config.SERVICE_NAME
    victim = sdk_tasks.get_service_tasks(service, task_prefix="hello-0")[0]

    sdk_cmd.kill_task_with_pattern("hello-container-path/output", "nobody", agent_host=victim.host)

    # The task id must change, and the service must end up healthy again.
    sdk_tasks.check_tasks_updated(service, "hello-0", [victim.id])
    check_healthy()
def test_kill_all_executors():
    """Kill the `mesos-default-executor` process on every task's host and expect all tasks to be relaunched."""
    service = config.SERVICE_NAME
    tasks_before = sdk_tasks.get_service_tasks(service)
    ids_before = [task.id for task in tasks_before]

    for task in tasks_before:
        sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=task.host)

    # An empty prefix matches every task of the service.
    sdk_tasks.check_tasks_updated(service, "", ids_before)
    check_healthy()
def test_losing_and_regaining_index_health(default_populated_index: None) -> None:
    """Kill the data-0 Elasticsearch process and watch the index go green -> yellow -> green."""

    def _expect_index_health(color: str) -> None:
        config.check_elasticsearch_index_health(index_name, color, service_name=service_name)

    _expect_index_health("green")

    data_node = sdk_tasks.get_service_tasks(service_name, "data-0-node")[0]
    sdk_cmd.kill_task_with_pattern("data__.*Elasticsearch", "nobody", agent_host=data_node.host)

    # Health must first degrade, then recover.
    for color in ("yellow", "green"):
        _expect_index_health(color)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_bump_hello_cpus():
    """Bump the hello cpu setting and verify all running hello tasks pick up the new value."""
    ids_before = sdk_tasks.get_task_ids(foldered_name, "hello")
    log.info("hello ids: " + str(ids_before))

    updated_cpus = config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "hello", ids_before)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Only tasks in TASK_RUNNING state count towards the expected total.
    running = []
    for task in sdk_tasks.get_service_tasks(foldered_name, task_prefix="hello"):
        if task.state == "TASK_RUNNING":
            running.append(task)

    assert len(running) == config.hello_task_count(foldered_name)
    for task in running:
        assert config.close_enough(task.resources["cpus"], updated_cpus)
def test_bump_world_cpus():
    """Bump the world cpu setting and verify all running world tasks pick up the new value."""
    ids_before = sdk_tasks.get_task_ids(foldered_name, "world")
    log.info("world ids: " + str(ids_before))

    updated_cpus = config.bump_world_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "world", ids_before)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Only tasks in TASK_RUNNING state count towards the expected total.
    running = []
    for task in sdk_tasks.get_service_tasks(foldered_name, task_prefix="world"):
        if task.state == "TASK_RUNNING":
            running.append(task)

    assert len(running) == config.world_task_count(foldered_name)
    for task in running:
        assert config.close_enough(task.resources["cpus"], updated_cpus)
def test_master_reelection():
    """Kill the current Elasticsearch master and verify that a different master node takes over."""
    old_master = config.get_elasticsearch_master(service_name=foldered_name)
    old_master_host = sdk_tasks.get_service_tasks(foldered_name, old_master)[0].host

    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch", "nobody", agent_host=old_master_host
    )

    # Recovery has to start and finish before the cluster is inspected again.
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)

    elected = config.get_elasticsearch_master(service_name=foldered_name)
    assert elected.startswith("master") and elected != old_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_reelection() -> None:
    """Kill the current Elasticsearch master and verify that a different master node takes over."""
    old_master = config.get_elasticsearch_master(service_name=service_name)
    old_master_host = sdk_tasks.get_service_tasks(service_name, old_master)[0].host

    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch", "nobody", agent_host=old_master_host
    )

    # Recovery has to start and finish before the cluster is inspected again.
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    config.wait_for_expected_nodes_to_exist(service_name=service_name)

    elected = config.get_elasticsearch_master(service_name=service_name)
    assert elected.startswith("master") and elected != old_master

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_bump_hello_cpus():
    """Bump the hello cpu setting of the foldered service and verify running hello tasks use the new value."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    ids_before = sdk_tasks.get_task_ids(foldered_name, "hello")
    log.info("hello ids: " + str(ids_before))

    updated_cpus = config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "hello", ids_before)
    config.check_running(foldered_name)

    # Only tasks in TASK_RUNNING state count towards the expected total.
    running = []
    for task in sdk_tasks.get_service_tasks(foldered_name, task_prefix="hello"):
        if task.state == "TASK_RUNNING":
            running.append(task)

    assert len(running) == config.hello_task_count(foldered_name)
    for task in running:
        assert config.close_enough(task.resources["cpus"], updated_cpus)
def test_integrity_on_data_node_failure(hdfs_client):
    """
    Write a file, kill two data nodes, and verify the file can still be read.
    """
    filename = config.get_unique_filename("test_datanode_fail")

    # The write only returns successfully once the data has been replicated.
    config.hdfs_client_write_data(filename)

    # Three data nodes are expected (data-0,1,2); take down the first two.
    data_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "data")
    for position in (0, 1):
        victim_host = data_tasks[position].host
        sdk_cmd.kill_task_with_pattern("DataNode", "nobody", agent_host=victim_host)

    config.hdfs_client_read_data(filename)
    config.check_healthy(config.SERVICE_NAME)
def test_integrity_on_name_node_failure(hdfs_client):
    """
    Kill the active name node and verify that data can be written and read
    once the other name node has become active.
    """
    retry_budget_ms = config.DEFAULT_HDFS_TIMEOUT * 1000
    name_nodes = ("name-0-node", "name-1-node")

    @retrying.retry(wait_fixed=1000, stop_max_delay=retry_budget_ms)
    def _find_active():
        # Candidates are probed lazily, in order; the first active one wins.
        active = next((node for node in name_nodes if is_name_node_active(node)), None)
        if active is None:
            raise Exception("Failed to determine active name node")
        return active

    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=retry_budget_ms,
        retry_on_result=lambda res: not res,
    )
    def _await_active(namenode):
        return is_name_node_active(namenode)

    previously_active = _find_active()
    victim_host = sdk_tasks.get_service_tasks(config.SERVICE_NAME, previously_active)[0].host
    sdk_cmd.kill_task_with_pattern("NameNode", "nobody", agent_host=victim_host)

    # With the active name node gone, the other one is expected to take over.
    successor = "name-0-node" if previously_active == "name-1-node" else "name-1-node"
    _await_active(successor)

    filename = config.get_unique_filename("test_namenode_fail")
    config.hdfs_client_write_data(filename)
    config.hdfs_client_read_data(filename)

    config.check_healthy(config.SERVICE_NAME)
def test_losing_and_regaining_index_health(default_populated_index):
    """Kill the data-0 Elasticsearch process and watch the default index go green -> yellow -> green."""

    def _expect_index_health(color):
        config.check_elasticsearch_index_health(
            config.DEFAULT_INDEX_NAME, color, service_name=foldered_name
        )

    _expect_index_health("green")

    data_node = sdk_tasks.get_service_tasks(foldered_name, "data-0-node")[0]
    sdk_cmd.kill_task_with_pattern("data__.*Elasticsearch", "nobody", agent_host=data_node.host)

    # Health must first degrade, then recover.
    for color in ("yellow", "green"):
        _expect_index_health(color)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_bump_hello_cpus():
    """Bump the hello cpu setting and verify running hello tasks report the new value.

    The cpu comparison tolerates an absolute difference below 0.00001.
    """

    def close_enough(val0, val1):
        epsilon = 0.00001
        diff = abs(val0 - val1)
        return diff < epsilon

    config.check_running(config.SERVICE_NAME)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(config.SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", hello_ids)
    config.check_running(config.SERVICE_NAME)

    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    # Guard against a vacuous pass: with no running hello task the loop below
    # would verify nothing.
    assert running_tasks, "Expected at least one running hello task"
    for t in running_tasks:
        assert close_enough(t.resources["cpus"], updated_cpus)
def test_kill_essential():
    """kill the essential task, verify that both tasks are relaunched against a matching executor"""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2
    ids_before = [task.id for task in pod_tasks]

    # Both tasks share a pod and therefore a host, so either task's host works.
    sdk_cmd.kill_task_with_pattern(
        "shared-volume/essential",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=pod_tasks[0].host,
    )

    # Both task ids must change, and recovery must complete.
    sdk_tasks.check_tasks_updated(service, "hello-0", ids_before)
    sdk_plan.wait_for_completed_recovery(service)

    # The first verify_shared_executor call deleted the files; both are
    # expected back after the relaunch. Keep them in place for the next test.
    verify_shared_executor("hello-0", delete_files=False)
def test_kill_agent():
    """kill the agent task, verify that the agent task is relaunched against the same executor as before"""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2

    def _first_named(task_name):
        return [task for task in pod_tasks if task.name == task_name][0]

    node_before = _first_named("hello-0-node")
    agent_before = _first_named("hello-0-agent")

    sdk_cmd.kill_task_with_pattern(
        "agent-container-path/output",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=agent_before.host,
    )

    # The node task must keep its id while the agent task gets a new one.
    sdk_tasks.check_tasks_not_updated(service, "hello-0-node", [node_before.id])
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_updated(service, "hello-0-agent", [agent_before.id])

    # Re-run the shared executor check with its default arguments after the relaunch.
    verify_shared_executor("hello-0")
def test_custom_jmx_port():
    """Set the Cassandra JMX port to 7200 and verify every node task has that port in LISTEN state."""
    service_name = config.get_foldered_service_name()

    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        service_name,
        {"cassandra": {"jmx_port": 7200}},
        config.DEFAULT_TASK_COUNT,
    )
    sdk_plan.wait_for_completed_deployment(service_name)

    listen_marker = ":7200 (LISTEN)"
    for node_task in sdk_tasks.get_service_tasks(service_name, "node"):
        # lsof output is inspected for the listening socket on the new port.
        lsof_command = "task exec {} lsof -i :7200".format(node_task.id)
        _, stdout, _ = sdk_cmd.run_cli(lsof_command)
        assert listen_marker in stdout
def test_kill_nonessential():
    """kill the nonessential task, verify that the nonessential task is relaunched against the same executor as before"""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2

    def _first_named(task_name):
        return [task for task in pod_tasks if task.name == task_name][0]

    essential_before = _first_named("hello-0-essential")
    nonessential_before = _first_named("hello-0-nonessential")

    # Both tasks live in the same pod, hence on the same host.
    sdk_cmd.kill_task_with_pattern(
        "shared-volume/nonessential",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=nonessential_before.host,
    )

    # Only the nonessential task may get a new id.
    sdk_tasks.check_tasks_updated(service, "hello-0-nonessential", [nonessential_before.id])
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_not_updated(service, "hello-0-essential", [essential_before.id])

    # The first verify_shared_executor call deleted the files; only the
    # nonessential file is expected back after its relaunch.
    verify_shared_executor("hello-0", expected_files=["nonessential"])
def test_scheduler_task_placement_by_marathon():
    """Check that Marathon places the scheduler task on the agent named in a hostname constraint."""
    package, service = config.PACKAGE_NAME, config.SERVICE_NAME
    sdk_install.uninstall(package, service)
    try:
        # Choose one private agent and constrain the scheduler to it.
        chosen_agent = sdk_agents.get_private_agents().pop()["hostname"]
        logging.info("Constraining scheduler placement to [{}]".format(chosen_agent))

        placement = [["hostname", "CLUSTER", "{}".format(chosen_agent)]]
        sdk_install.install(
            package,
            service,
            expected_running_tasks=1,
            additional_options={"service": {"constraints": placement, "yaml": "simple"}},
            wait_for_deployment=False,
        )

        matches = sdk_tasks.get_service_tasks("marathon", service)
        assert len(matches) == 1, "More than 1 task matched name [{}] : [{}]".format(
            service, matches
        )

        placed_on = matches.pop().host
        assert chosen_agent == placed_on, "Scheduler task constraint placement failed by marathon"
    finally:
        # Leave the cluster clean regardless of the outcome.
        sdk_install.uninstall(package, service)
def test_unauthorized_users() -> None:
    """Verify that cqlsh rejects a login as dcossuperuser with a wrong password."""
    node_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "node-0")[0]

    # One shell command, split across literals purely for readability.
    command_template = (
        "task exec {} bash -c '"
        "export JAVA_HOME=$(ls -d $MESOS_SANDBOX/jdk*/jre/) ; "
        "export PATH=$MESOS_SANDBOX/python-dist/bin:$PATH ; "
        "export PATH=$(ls -d $MESOS_SANDBOX/apache-cassandra-*/bin):$PATH ; "
        "cqlsh -u dcossuperuser -p wrongpassword -e \"SHOW VERSION\" "
        "node-0-server.$FRAMEWORK_HOST $CASSANDRA_NATIVE_TRANSPORT_PORT' "
    )
    _, stdout, stderr = sdk_cmd.run_cli(command_template.format(node_task.id))

    expected_error = "Provided username dcossuperuser and/or password are incorrect"
    assert expected_error in stderr
def test_xmx_and_xms_flags(configure_security):
    """Install with custom heap sizes and check that each task's JVM has exactly one Xms and one Xmx flag.

    `ps aux` is run in every task; tasks whose string form contains "exporter"
    are exempt from the generic single-flag check.
    """
    # Custom heap sizes for the different pod types.
    MASTER_NODE_HEAP = 700
    DATA_NODE_HEAP = 800
    COORDINATOR_NODE_HEAP = 900
    INGEST_NODE_HEAP = 1000

    # Install the service, overriding the default heap sizes.
    heap_options = {
        "master_nodes": {"heap": {"size": MASTER_NODE_HEAP}},
        "data_nodes": {"heap": {"size": DATA_NODE_HEAP}},
        "coordinator_nodes": {"heap": {"size": COORDINATOR_NODE_HEAP}},
        "ingest_nodes": {"heap": {"size": INGEST_NODE_HEAP}},
    }
    sdk_install.install(
        config.PACKAGE_NAME, config.SERVICE_NAME, config.DEFAULT_TASK_COUNT, heap_options
    )

    flag_xms = "Xms"
    flag_xmx = "Xmx"

    # (marker in the task's string form, heap size, xms message, xmx message)
    per_node_checks = (
        (
            "master",
            MASTER_NODE_HEAP,
            "Configured master node xms flag prefix should appear once",
            "Configured master node xmx flag prefix should appear once",
        ),
        (
            "data",
            DATA_NODE_HEAP,
            "Configured data node xms flag prefix should appear once",
            "Configured data node xmx flag prefix should appear once",
        ),
        (
            "coordinator",
            COORDINATOR_NODE_HEAP,
            "Configured coordinator node xms flag prefix should appear once",
            "Configured coordinator node xmx flag prefix should appear once",
        ),
        (
            "ingest",
            INGEST_NODE_HEAP,
            "Configured ingest node flag xms prefix should appear once",
            "Configured ingest node flag xmx prefix should appear once",
        ),
    )

    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, task.name, "ps aux")
        ps_output = str(stdout)
        task_text = str(task)

        if "exporter" not in task_text:
            assert ps_output.count(flag_xms) == 1, "Default xms flag prefix should appear once"
            assert ps_output.count(flag_xmx) == 1, "Default xmx flag prefix should appear once"

        # Every marker is tested independently; a task may match several.
        for marker, heap_size, xms_message, xmx_message in per_node_checks:
            if marker not in task_text:
                continue
            log.info("Checking flags in " + marker + " node: " + task.name)
            assert ps_output.count(flag_xms + str(heap_size)) == 1, xms_message
            assert ps_output.count(flag_xmx + str(heap_size)) == 1, xmx_message
def test_overlay_network():
    """Verify task networking and the VIP endpoints of the deployed service.

    After the deploy plan completes, the test checks that exactly the four
    expected tasks exist, that "host-" tasks hold port resources while
    "overlay-" tasks do not, and that the "overlay-vip" and "host-vip"
    endpoints each expose a single address (port 4044) and a single DNS name.
    """
    deployment_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info(sdk_plan.plan_string("deploy", deployment_plan))

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    expected_running_tasks = {
        "overlay-vip-0-server",
        "overlay-0-server",
        "host-vip-0-server",
        "host-0-server",
    }
    assert expected_running_tasks == {t.name for t in framework_tasks}

    for task in framework_tasks:
        name = task.name
        if name.startswith("host-"):
            assert "ports" in task.resources, "Task {} should have port resources".format(name)
            sdk_networks.check_task_network(name, expected_network_name=None)
        elif name.startswith("overlay-"):
            assert "ports" not in task.resources, "Task {} should NOT have port resources".format(
                name
            )
            sdk_networks.check_task_network(name)
        else:
            # Raise explicitly: `assert False` is stripped under `python -O`.
            raise AssertionError("Unknown task {}".format(name))

    endpoints_result = sdk_networks.get_endpoint_names(config.PACKAGE_NAME, config.SERVICE_NAME)
    assert len(endpoints_result) == 2, "Expected 2 endpoints, got: {}".format(endpoints_result)

    overlay_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "overlay-vip"
    )
    assert "address" in overlay_endpoints_result, "overlay endpoints missing 'address': {}".format(
        overlay_endpoints_result
    )
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "overlay-vip-0-server", 4044
    )

    host_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "host-vip"
    )
    # The message names the host endpoint; it previously said "overlay" and
    # glued the payload onto the text without a separator.
    assert "address" in host_endpoints_result, "host endpoints missing 'address': {}".format(
        host_endpoints_result
    )
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "host-vip-0-server", 4044
    )
def check_health_check_logs():
    """Assert that broker 0's executor stderr logs more passed than failed health checks.

    Steps: find the running ``kafka-0-broker`` task, resolve an executor path
    on its agent, locate and download that executor's ``stderr`` file into a
    temporary file, and count the lines starting with "Health check passed" /
    "Health check failed". The temporary file is removed before returning.

    Download and read errors are logged, not raised; the counters then keep
    whatever was collected and the final assertion decides the outcome.

    Raises:
        Exception: if broker 0 is not running, or the executor path or its
            stderr file cannot be found.
        AssertionError: if passed health checks do not outnumber failed ones.
    """
    import os  # local import: needed only to remove the temporary download

    # Get any 0th broker among the kafka service tasks
    broker = None
    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        if task.state == "TASK_RUNNING" and task.name == "kafka-0-broker":
            broker = task
            break

    if not broker:
        raise Exception("Broker 0 is not in status: TASK_RUNNING!")

    # Get Mesos Executor path for Broker 0
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()
    agent_executor_paths = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/debug".format(broker.agent_id)
    ).json()

    executor_path = None
    # NOTE(review): this picks the first cluster task on broker 0's agent, which
    # is not necessarily broker 0 itself when the agent hosts several tasks —
    # confirm whether the match should also compare task ids.
    for cluster_task in cluster_tasks["tasks"]:
        if cluster_task["slave_id"] == broker.agent_id:
            executor_path = sdk_diag._find_matching_executor_path(
                agent_executor_paths, sdk_diag._TaskEntry(cluster_task)
            )
            break

    if not executor_path:
        raise Exception("Executor path not found!")

    # Get path of executor's stderr file
    file_infos = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/browse?path={}".format(broker.agent_id, executor_path)
    ).json()

    file_info = None
    for entry in file_infos:
        if entry["path"].endswith("/stderr"):
            file_info = entry
            break

    if not file_info:
        raise Exception("Executor stderr file not found!")

    passed = 0
    failed = 0
    stderr_path = None
    try:
        # Download stderr file
        try:
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
                stderr_path = f.name
                stream = sdk_cmd.cluster_request(
                    "GET",
                    "/slave/{}/files/download?path={}".format(broker.agent_id, file_info["path"]),
                )
                for chunk in stream.iter_content(chunk_size=8192):
                    f.write(chunk)
        except Exception:
            log.exception(
                "Failed to get file: {} from agent: {}".format(file_info["path"], broker.agent_id)
            )

        # Check stderr for health check output. Skipped when the temporary
        # file could not even be created, since there is nothing to read.
        if stderr_path is not None:
            try:
                with open(stderr_path) as f:
                    for line in f:
                        if line.startswith("Health check passed"):
                            passed += 1
                        elif line.startswith("Health check failed"):
                            failed += 1
            except Exception:
                log.exception(
                    "Failed to read downloaded executor stderr file: {} from agent: {}".format(
                        file_info["path"], broker.agent_id
                    )
                )
    finally:
        # The file is created with delete=False, so it must be removed by hand.
        if stderr_path is not None:
            os.remove(stderr_path)

    log.info("HEALTH CHECK success:{} faliure:{}".format(passed, failed))
    assert passed > failed
def get_metrics(package_name: str, service_name: str, pod_name: str, task_name: str) -> List:
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    pod_name -- the name of the pod whose `pod info` output is searched for the task's container id
    task_name -- the name of the task whose agent to run metrics commands from

    Returns an empty list (after logging a warning) when the task is not
    listed in the pod info output.

    Raises:
        Exception: if the task is not found in the service, if the app metrics
            lack 'dimensions' or 'dimensions.task_name', or if the reported
            task name does not match *task_name*.
        ValueError: if the agent's containers endpoint does not report the
            task's container id.
        AssertionError: if the `pod info` command exits with a non-zero code.
    """
    # Find task entry in mesos state. Defaulting to None keeps the check below
    # meaningful when nothing matches (or when there are no tasks at all).
    tasks = sdk_tasks.get_service_tasks(service_name)
    task_to_check = next((task for task in tasks if task.name == task_name), None)

    if task_to_check is None:
        raise Exception(
            "Task named {} not found in service {}: {}".format(task_name, service_name, tasks)
        )

    # Find task's container id via recent TaskStatus:
    rc, stdout, _ = sdk_cmd.svc_cli(
        package_name, service_name, "pod info {}".format(pod_name), print_output=False
    )
    assert rc == 0, "Pod info failed"
    pod_info = json.loads(stdout)

    task_container_id = None
    for pod_task in pod_info:
        if pod_task["info"]["name"] == task_name:
            task_container_id = pod_task["status"]["containerStatus"]["containerId"]["value"]
            break

    if task_container_id is None:
        log.warning("Task named {} not found in pod {}: {}".format(task_name, pod_name, pod_info))
        return []

    # Not related to functionality, but consuming this endpoint to verify metrics integrity
    containers_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers".format(task_to_check.agent_id),
        retry=False,
    )
    reported_container_ids = json.loads(containers_response.text)

    if task_container_id not in reported_container_ids:
        raise ValueError(
            "The metrics /container endpoint returned {} for agent {}, expected {} to be returned as well".format(
                reported_container_ids, task_to_check.agent_id, task_container_id
            )
        )

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            task_to_check.agent_id, task_container_id
        ),
        retry=False,
    )
    app_response.raise_for_status()
    app_json = app_response.json()

    if "dimensions" not in app_json:
        log.error("Expected key '%s' not found in app metrics: %s", "dimensions", app_json)
        raise Exception("Expected key 'dimensions' not found in app metrics")

    if "task_name" not in app_json["dimensions"]:
        log.error(
            "Expected key '%s' not found in app metrics: %s", "dimensions.task_name", app_json
        )
        raise Exception("Expected key 'dimensions.task_name' not found in app metrics")

    if app_json["dimensions"]["task_name"] == task_name:
        return list(app_json["datapoints"])

    raise Exception("No metrics found for task {} in service {}".format(task_name, service_name))
def test_overlay_network():
    """Verify task networking and the VIP endpoints of the deployed service.

    After the deploy plan completes, the test checks that exactly the four
    expected tasks exist, that "host-" tasks hold port resources while
    "overlay-" tasks do not, and that the "overlay-vip" and "host-vip"
    endpoints each expose a single address (port 4044) and a single DNS name.
    """
    deployment_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info(sdk_plan.plan_string("deploy", deployment_plan))

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    expected_running_tasks = {
        "overlay-vip-0-server",
        "overlay-0-server",
        "host-vip-0-server",
        "host-0-server",
    }
    assert expected_running_tasks == {t.name for t in framework_tasks}

    for task in framework_tasks:
        name = task.name
        if name.startswith("host-"):
            assert "ports" in task.resources, "Task {} should have port resources".format(name)
            sdk_networks.check_task_network(name, expected_network_name=None)
        elif name.startswith("overlay-"):
            assert "ports" not in task.resources, "Task {} should NOT have port resources".format(
                name
            )
            sdk_networks.check_task_network(name)
        else:
            # Raise explicitly: `assert False` is stripped under `python -O`.
            raise AssertionError("Unknown task {}".format(name))

    endpoints_result = sdk_networks.get_endpoint_names(config.PACKAGE_NAME, config.SERVICE_NAME)
    assert len(endpoints_result) == 2, "Expected 2 endpoints, got: {}".format(endpoints_result)

    overlay_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "overlay-vip"
    )
    assert "address" in overlay_endpoints_result, "overlay endpoints missing 'address': {}".format(
        overlay_endpoints_result
    )
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "overlay-vip-0-server", 4044
    )

    host_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "host-vip"
    )
    # The message names the host endpoint; it previously said "overlay" and
    # glued the payload onto the text without a separator.
    assert "address" in host_endpoints_result, "host endpoints missing 'address': {}".format(
        host_endpoints_result
    )
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "host-vip-0-server", 4044
    )