def get_metrics(package_name, service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package, passed to the service CLI for 'pod info'
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from

    Returns the 'datapoints' list from the app metrics of the task's container,
    or an empty list when 'pod info' does not list a task named task_name.

    Raises Exception when the task is not among the service's tasks, when the
    metrics request fails, or when the returned metrics carry a different
    executor ID than the task.
    """
    # Start from None so that a missing task reaches the explicit check below
    # instead of failing on an unbound local variable.
    task_to_check = None
    for task in shakedown.get_service_tasks(service_name):
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # TODO: once the /containers endpoint
    # ({dcos_url}/system/v1/agent/{agent_id}/metrics/v0/containers) reports the
    # correct container IDs, fetch the container list from it and iterate over
    # those IDs instead of resolving the container ID via 'pod info' below.

    # Instead of receiving the pod name in this function's parameter list,
    # extract the name of the pod from the task name, so the signature does not
    # have to change when the TODO above is addressed.
    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = next((task for task in pod_info if task["info"]["name"] == task_name), None)
    if not task_info:
        return []

    container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    app_url = "{}/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
        shakedown.dcos_url(), agent_id, container_id)
    app_response = sdk_cmd.request("GET", app_url, retry=False)
    # NOTE(review): assumes a requests-style response whose `ok` is a boolean;
    # the previous `ok is None` comparison could never be true.
    if not app_response.ok:
        raise Exception("Failed to get metrics from container")

    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def get_scheduler_host(service_name):
    """Return one IP reported for the Marathon task that runs the given service.

    Marathon mangles foldered paths as follows: "/path/to/svc" => "svc.to.path",
    so the service name is converted to that form before the lookup.

    Raises Exception, listing the available Marathon task names, when no IP is
    found.
    """
    path_parts = service_name.lstrip('/').split('/')
    app_name = '.'.join(reversed(path_parts))

    ips = shakedown.get_service_ips('marathon', app_name)
    if len(ips) > 0:
        return ips.pop()

    marathon_task_names = [task['name'] for task in shakedown.get_service_tasks('marathon')]
    raise Exception('No IPs found for marathon task "{}". Available tasks are: {}'.format(
        app_name, marathon_task_names))
def fn(): nonlocal consecutive_task_running svc_tasks = shakedown.get_service_tasks(PACKAGE_NAME) states = [t['state'] for t in svc_tasks] print('Task states: {}'.format(states)) if 'TASK_RUNNING' in states: consecutive_task_running += 1 assert consecutive_task_running <= 3 else: consecutive_task_running = 0 return False
def get_metrics(package_name, service_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from

    Returns the 'datapoints' list from the app metrics of the task's container,
    or an empty list when 'pod info' does not list a task named task_name.

    Raises Exception when the task is not among the service's tasks or when the
    returned metrics carry a different executor ID than the task, and
    ValueError when the agent's containers listing omits the task's container.
    """
    # Start from None so that a missing task reaches the explicit check below
    # instead of failing on an unbound local variable.
    task_to_check = None
    for task in shakedown.get_service_tasks(service_name):
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # The pod name is the first two dash-separated elements of the task name.
    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name, "pod info {}".format(pod_name), json=True)
    task_info = next((task for task in pod_info if task["info"]["name"] == task_name), None)
    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    # Not related to functionality but consuming this
    # endpoint to verify downstream integrity
    containers_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers".format(agent_id), retry=False)
    reported_container_ids = json.loads(containers_response.text)
    if task_container_id not in reported_container_ids:
        raise ValueError("The metrics /container endpoint returned {}, expecting {} to be returned as well".format(
            reported_container_ids, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(agent_id, task_container_id), retry=False)
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def get_hello_world_agent_sets():
    """Return the agent IDs hosting 'hello-' tasks and those hosting 'world-' tasks.

    The result is a pair of lists (hello agents, world agents) with one entry
    per task of config.SERVICE_NAME. Asserts if a task name has neither prefix.
    """
    agents_by_prefix = {'hello-': [], 'world-': []}
    for task in shakedown.get_service_tasks(config.SERVICE_NAME):
        for prefix, agents in agents_by_prefix.items():
            if task['name'].startswith(prefix):
                agents.append(task['slave_id'])
                break
        else:
            # Neither prefix matched this task.
            assert False, "Unknown task: " + task['name']
    return agents_by_prefix['hello-'], agents_by_prefix['world-']
def test_no_colocation_in_podtypes():
    """Check that no two 'hellos' and no two 'worlds' are colocated on the same agent."""
    all_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    print(all_tasks)

    agents_by_prefix = {'hello-': [], 'world-': []}
    for task in all_tasks:
        for prefix, agents in agents_by_prefix.items():
            if task['name'].startswith(prefix):
                agents.append(task['slave_id'])
                break
        else:
            # Neither prefix matched this task.
            assert False, "Unknown task: " + task['name']

    # A duplicate agent ID within one pod type means two of its tasks share an agent.
    for agents in agents_by_prefix.values():
        assert len(agents) == len(set(agents))
def test_bump_world_cpus():
    """Call config.bump_world_cpus and verify running 'world' tasks report the returned CPU value."""
    service = FOLDERED_SERVICE_NAME
    config.check_running(service)
    ids_before_update = sdk_tasks.get_task_ids(service, 'world')
    log.info('world ids: ' + str(ids_before_update))

    expected_cpus = config.bump_world_cpus(service)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before_update)
    config.check_running(service)

    running_world_tasks = []
    for task in shakedown.get_service_tasks(service):
        if task['name'].startswith('world') and task['state'] == "TASK_RUNNING":
            running_world_tasks.append(task)

    assert len(running_world_tasks) == config.world_task_count(service)
    for task in running_world_tasks:
        assert close_enough(task['resources']['cpus'], expected_cpus)
def get_job_tasks(job_id, run_id):
    """Return the 'metronome' service tasks whose IDs appear in the given job run.

    The run is fetched through a metronome client; each service task is
    returned once per matching task ID listed in the run.
    """
    run = metronome.create_client().get_run(job_id, run_id)
    run_task_ids = [run_task['id'] for run_task in run['tasks']]
    return [
        service_task
        for service_task in shakedown.get_service_tasks('metronome')
        for run_task_id in run_task_ids
        if run_task_id == service_task['id']
    ]
def test_bump_hello_cpus():
    """Call config.bump_hello_cpus and verify running 'hello' tasks report the returned CPU value."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(service)
    ids_before_update = sdk_tasks.get_task_ids(service, 'hello')
    log.info('hello ids: ' + str(ids_before_update))

    expected_cpus = config.bump_hello_cpus(service)

    sdk_tasks.check_tasks_updated(service, 'hello', ids_before_update)
    config.check_running(service)

    running_hello_tasks = []
    for task in shakedown.get_service_tasks(service):
        if task['name'].startswith('hello') and task['state'] == "TASK_RUNNING":
            running_hello_tasks.append(task)

    assert len(running_hello_tasks) == config.hello_task_count(service)
    for task in running_hello_tasks:
        assert config.close_enough(task['resources']['cpus'], expected_cpus)
def test_config_update_while_partitioned():
    """Raise WORLD_CPUS while the world-0-server host is partitioned, then verify the update.

    After the agent is reconnected, the 'world' tasks must have been replaced
    and every running one must report the new CPU value.
    """
    ids_before_update = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    partitioned_host = sdk_hosts.system_host(config.SERVICE_NAME, "world-0-server")
    shakedown.partition_agent(partitioned_host)

    # Push the config change while the agent is unreachable.
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    expected_cpus = float(marathon_config['env']['WORLD_CPUS']) + 0.1
    marathon_config['env']['WORLD_CPUS'] = str(expected_cpus)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config, wait_for_completed_deployment=False)

    shakedown.reconnect_agent(partitioned_host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before_update)
    config.check_running()

    running_world_tasks = []
    for task in shakedown.get_service_tasks(config.SERVICE_NAME):
        if task['name'].startswith('world') and task['state'] == "TASK_RUNNING":
            running_world_tasks.append(task)

    assert len(running_world_tasks) == config.world_task_count(config.SERVICE_NAME)
    for task in running_world_tasks:
        assert config.close_enough(task['resources']['cpus'], expected_cpus)
def test_bump_hello_cpus():
    """Call config.bump_hello_cpus and verify running 'hello' tasks report the returned CPU value."""

    def close_enough(val0, val1):
        # Float comparison with a fixed absolute tolerance.
        return abs(val0 - val1) < 0.00001

    config.check_running(config.SERVICE_NAME)
    ids_before_update = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello')
    log.info('hello ids: ' + str(ids_before_update))

    expected_cpus = config.bump_hello_cpus(config.SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello', ids_before_update)
    config.check_running(config.SERVICE_NAME)

    running_hello_tasks = []
    for task in shakedown.get_service_tasks(config.SERVICE_NAME):
        if task['name'].startswith('hello') and task['state'] == "TASK_RUNNING":
            running_hello_tasks.append(task)

    for task in running_hello_tasks:
        assert close_enough(task['resources']['cpus'], expected_cpus)
def fn():
    """Return True once at least `expected_task_count` tasks of `service_name` are running.

    `service_name` and `expected_task_count` are not defined here; they are
    read from the surrounding scope. A DCOSHTTPException while listing tasks
    is reported and treated as "no tasks". Progress is printed on every call.
    """
    try:
        tasks = shakedown.get_service_tasks(service_name)
    except dcos.errors.DCOSHTTPException:
        print('Failed to get tasks for service {}'.format(service_name))
        tasks = []

    running_task_names = [t['name'] for t in tasks if t['state'] == 'TASK_RUNNING']
    other_tasks = [
        '{}={}'.format(t['name'], t['state'])
        for t in tasks
        if t['state'] != 'TASK_RUNNING'
    ]

    running_count = len(running_task_names)
    print('Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
        expected_task_count, running_count, len(tasks), running_task_names, other_tasks))
    return running_count >= expected_task_count
def test_bump_world_cpus():
    """Raise WORLD_CPUS by 0.1 in the Marathon app and verify running 'world' tasks report it."""
    check_running()
    ids_before_update = tasks.get_task_ids(PACKAGE_NAME, 'world')
    print('world ids: ' + str(ids_before_update))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    current_cpus = float(marathon_config['env']['WORLD_CPUS'])
    expected_cpus = current_cpus + 0.1
    marathon_config['env']['WORLD_CPUS'] = str(expected_cpus)
    marathon.update_app(PACKAGE_NAME, marathon_config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before_update)
    check_running()

    running_world_tasks = []
    for task in shakedown.get_service_tasks(PACKAGE_NAME):
        if task['name'].startswith('world') and task['state'] == "TASK_RUNNING":
            running_world_tasks.append(task)

    assert len(running_world_tasks) == world_task_count()
    for task in running_world_tasks:
        assert close_enough(task['resources']['cpus'], expected_cpus)
def test_bump_world_cpus():
    """Verify that a 0.1 increase of WORLD_CPUS reaches every running 'world' task."""
    check_running()
    original_world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    print('world ids: ' + str(original_world_ids))

    # Update the app's environment with the bumped CPU value.
    app_config = marathon.get_config(PACKAGE_NAME)
    bumped_cpus = float(app_config['env']['WORLD_CPUS']) + 0.1
    app_config['env']['WORLD_CPUS'] = str(bumped_cpus)
    marathon.update_app(PACKAGE_NAME, app_config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', original_world_ids)
    check_running()

    def is_running_world(task):
        return task['name'].startswith('world') and task['state'] == "TASK_RUNNING"

    service_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    running_world = [task for task in service_tasks if is_running_world(task)]

    assert len(running_world) == world_task_count()
    for task in running_world:
        assert close_enough(task['resources']['cpus'], bumped_cpus)
def _submit_job_and_verify_users(user, use_ucr_for_spark_submit, extra_args=[]):
    """Submit the MockTaskRunner job and run _check_task_user on its driver and executor tasks.

    `extra_args` is appended to the base submit arguments (it is only read,
    never mutated). The driver is killed when the checks finish or fail.
    """
    app_name = "MockTaskRunner"
    base_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]
    driver_task_id = utils.submit_job(
        service_name=SERVICE_NAME,
        app_url=utils.dcos_test_jar_url(),
        app_args="1 300",
        args=base_args + extra_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        tasks_to_check = [shakedown.get_task(driver_task_id, completed=False)]
        tasks_to_check = tasks_to_check + shakedown.get_service_tasks(app_name)
        for job_task in tasks_to_check:
            log.info(f"Checking task '{job_task['id']}'")
            _check_task_user(job_task, user, use_ucr_for_spark_submit)
    finally:
        log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
        utils.kill_driver(driver_task_id, service_name=SERVICE_NAME)
def get_metrics(service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from

    Every container listed by the task's agent is queried in turn; the
    datapoints of the first one whose executor ID matches the task are
    returned.

    Raises Exception when the task is not among the service's tasks, when the
    containers list cannot be fetched, or when no container matches.
    """
    # Start from None so that a missing task reaches the explicit check below
    # instead of failing on an unbound local variable.
    task_to_check = None
    for task in shakedown.get_service_tasks(service_name):
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # Fetch the list of containers for the agent
    containers_url = "{}/system/v1/agent/{}/metrics/v0/containers".format(
        shakedown.dcos_url(), agent_id)
    containers_response = cmd.request("GET", containers_url, retry=False)
    # NOTE(review): assumes a requests-style response whose `ok` is a boolean;
    # the previous `ok is None` comparison could never be true.
    if not containers_response.ok:
        log.info("Unable to fetch containers list")
        raise Exception(
            "Unable to fetch containers list: {}".format(containers_url))

    for container in json.loads(containers_response.text):
        app_url = "{}/{}/app".format(containers_url, container)
        app_response = cmd.request("GET", app_url, retry=False)
        if not app_response.ok:
            # This container's app metrics could not be fetched; try the next one.
            continue

        app_json = json.loads(app_response.text)
        if app_json['dimensions']['executor_id'] == executor_id:
            return app_json['datapoints']

    raise Exception("No metrics found")
def test_config_update_while_partitioned():
    """Verify a WORLD_CPUS update made during an agent partition is applied after reconnect."""
    service = config.SERVICE_NAME
    original_world_ids = sdk_tasks.get_task_ids(service, 'world')
    host = sdk_hosts.system_host(service, "world-0-server")
    shakedown.partition_agent(host)

    # Bump the CPU setting by 0.1 while the host is cut off.
    app_config = sdk_marathon.get_config(service)
    env = app_config['env']
    bumped_cpus = float(env['WORLD_CPUS']) + 0.1
    env['WORLD_CPUS'] = str(bumped_cpus)
    sdk_marathon.update_app(
        service, app_config, wait_for_completed_deployment=False)

    shakedown.reconnect_agent(host)
    sdk_tasks.check_tasks_updated(service, 'world', original_world_ids)
    config.check_running()

    def is_running_world(task):
        return task['name'].startswith('world') and task['state'] == "TASK_RUNNING"

    service_tasks = shakedown.get_service_tasks(service)
    running_world = [task for task in service_tasks if is_running_world(task)]

    assert len(running_world) == config.world_task_count(service)
    for task in running_world:
        assert config.close_enough(task['resources']['cpus'], bumped_cpus)
def test_shuffle_job(submit_args=[],
                     use_ucr_for_spark_submit=True,
                     use_cli_for_spark_submit=True,
                     check_network_labels=False):
    """Submit the shuffle job, check the network of its tasks and wait for its output.

    Keyword arguments:
    submit_args -- extra submit arguments (only read, never mutated, so the
                   shared default list is safe)
    use_ucr_for_spark_submit -- add the Mesos containerizer setting and check
                                tasks with is_ucr=True
    use_cli_for_spark_submit -- forwarded to _submit_shuffle_job as use_cli
    check_network_labels -- also check network labels (only when UCR is used)

    The driver is always killed once it has been submitted, including when a
    readiness or network check fails.
    """
    if use_ucr_for_spark_submit:
        submit_args = submit_args + ["--conf spark.mesos.containerizer=mesos"]

    driver_task_id = _submit_shuffle_job(use_cli=use_cli_for_spark_submit,
                                         sleep=300,
                                         extra_args=submit_args)
    verify_labels = check_network_labels and use_ucr_for_spark_submit

    # Everything after submission runs under try/finally: previously only the
    # final wait was covered, so a failing check left the driver running.
    try:
        sdk_tasks.check_running(SHUFFLE_JOB_FW_NAME,
                                SHUFFLE_JOB_NUM_EXECUTORS,
                                timeout_seconds=600)

        driver_task = shakedown.get_task(driver_task_id, completed=False)
        _check_task_network(driver_task, is_ucr=use_ucr_for_spark_submit)
        if verify_labels:
            _check_task_network_labels(driver_task)

        for task in shakedown.get_service_tasks(SHUFFLE_JOB_FW_NAME):
            _check_task_network(task, is_ucr=use_ucr_for_spark_submit)
            if verify_labels:
                _check_task_network_labels(task)

        utils.wait_for_running_job_output(
            driver_task_id,
            "Groups count: {}".format(SHUFFLE_JOB_EXPECTED_GROUPS_COUNT))
    finally:
        log.info("Cleaning up. Attempting to kill driver: {}".format(
            driver_task_id))
        utils.kill_driver(driver_task_id,
                          service_name=CNI_DISPATCHER_SERVICE_NAME)
def tasks_running_success_predicate(task_count):
    """Return True when exactly `task_count` tasks of PACKAGE_NAME are in TASK_RUNNING_STATE.

    Prints the running/total task counts on every call.
    """
    all_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    running = []
    for task in all_tasks:
        if task['state'] == TASK_RUNNING_STATE:
            running.append(task)

    running_count = len(running)
    print('Waiting for {} healthy tasks, got {}/{}'.format(
        task_count, running_count, len(all_tasks)))
    return running_count == task_count
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec.

    Also checks that all expected tasks are present, that only "host" tasks
    hold port resources, that each task is on the expected network, and that
    the overlay-vip and host-vip endpoints expose one address on port 4044 and
    the expected DNS name.
    """
    deployment_plan = plan.wait_for_completed_deployment(PACKAGE_NAME)
    utils.out("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct: (phase name, number of steps)
    expected_phases = [
        ('hello-overlay-vip-deploy', 1),
        ('hello-overlay-deploy', 1),
        ('hello-host-vip-deploy', 1),
        ('hello-host-deploy', 1),
        ('getter-deploy', 4),
    ]
    phases = deployment_plan['phases']
    assert len(phases) == len(expected_phases)
    for phase, (expected_name, _) in zip(phases, expected_phases):
        assert phase['name'] == expected_name
    for phase, (_, expected_step_count) in zip(phases, expected_phases):
        assert len(phase['steps']) == expected_step_count

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = list(
        shakedown.get_service_tasks(PACKAGE_NAME, completed=False))
    framework_task_names = [t["name"] for t in framework_tasks]
    expected_tasks = [
        'getter-0-get-host',
        'getter-0-get-overlay',
        'getter-0-get-overlay-vip',
        'getter-0-get-host-vip',
        'hello-host-vip-0-server',
        'hello-overlay-vip-0-server',
        'hello-host-0-server',
        'hello-overlay-0-server',
    ]
    for expected_task in expected_tasks:
        assert expected_task in framework_task_names, "Missing {expected}".format(
            expected=expected_task)

    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:  # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources, "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources, "Task {} should NOT have port resources".format(name)

    networks.check_task_network("hello-overlay-0-server")
    networks.check_task_network("hello-overlay-vip-0-server")
    networks.check_task_network("hello-host-0-server", expected_network_name=None)
    networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    def run_endpoints_command(command, failure_message):
        # Check the return code BEFORE parsing, so a failed command is reported
        # as such instead of surfacing as a JSON decoding error.
        stdout, _, rc = shakedown.run_dcos_command(command)
        assert rc == 0, failure_message
        return json.loads(stdout)

    def check_vip_endpoints(endpoints, kind, address_prefix, task_name):
        # One address with the expected prefix on port 4044, and one DNS entry
        # equal to the task's autoip host.
        assert "address" in endpoints, "{} endpoints missing 'address': {}".format(
            kind, endpoints)
        assert len(endpoints["address"]) == 1
        assert endpoints["address"][0].startswith(address_prefix)
        assert endpoints["address"][0].split(":")[-1] == "4044"
        assert "dns" in endpoints
        assert len(endpoints["dns"]) == 1
        assert endpoints["dns"][0] == hosts.autoip_host(PACKAGE_NAME, task_name, 4044)

    endpoints_result = run_endpoints_command(
        "{pkg} endpoints".format(pkg=PACKAGE_NAME), "Getting endpoints failed")
    assert len(endpoints_result) == 2, "Wrong number of endpoints got {} should be 2".format(
        len(endpoints_result))

    overlay_endpoints_result = run_endpoints_command(
        "{pkg} endpoints overlay-vip".format(pkg=PACKAGE_NAME),
        "Getting overlay endpoints failed")
    check_vip_endpoints(
        overlay_endpoints_result, "overlay", "9", "hello-overlay-vip-0-server")

    host_endpoints_result = run_endpoints_command(
        "{pkg} endpoints host-vip".format(pkg=PACKAGE_NAME),
        "Getting host endpoints failed")
    check_vip_endpoints(
        host_endpoints_result, "host", "10", "hello-host-vip-0-server")
def get_task_ids(prefix):
    """Return the IDs of the PACKAGE_NAME tasks whose name starts with `prefix`."""
    return [
        task['id']
        for task in shakedown.get_service_tasks(PACKAGE_NAME)
        if task['name'].startswith(prefix)
    ]
def get_task_ids():
    """Return the IDs of all tasks shakedown reports for PACKAGE_NAME."""
    task_ids = []
    for task in shakedown.get_service_tasks(PACKAGE_NAME):
        task_ids.append(task['id'])
    return task_ids
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec.

    Also checks that all EXPECTED_TASKS are present, that only "host" tasks
    hold port resources, that each task is on the expected network, and that
    the overlay-vip and host-vip endpoints expose one address on port 4044 and
    the expected DNS name.
    """
    service_name = config.SERVICE_NAME
    deployment_plan = sdk_plan.wait_for_completed_deployment(service_name)
    log.info("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct
    expected_phase_names = [
        'hello-overlay-deploy',
        'hello-overlay-vip-deploy',
        'hello-host-vip-deploy',
        'hello-host-deploy',
        'getter-deploy',
    ]
    phases = deployment_plan['phases']
    assert len(phases) == len(expected_phase_names)
    for phase, expected_name in zip(phases, expected_phase_names):
        assert phase['name'] == expected_name
    for phase in phases:
        assert len(phase['steps']) == 1

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(service_name, timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=60)
    except retrying.RetryError:
        # Deliberate best-effort: no recovery was observed within the timeout.
        pass

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = list(
        shakedown.get_service_tasks(service_name, completed=False))
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert expected_task in framework_task_names, "Missing {expected}".format(
            expected=expected_task)

    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:  # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources, "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources, "Task {} should NOT have port resources".format(name)

    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server", expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    def check_vip_endpoints(endpoint_name, kind, address_prefix, task_name):
        # One address with the expected prefix on port 4044, and one DNS entry
        # equal to the task's autoip host. `kind` labels the failure message so
        # that a host-vip failure is no longer reported as an overlay one.
        endpoints = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, service_name,
            'endpoints {}'.format(endpoint_name), json=True)
        assert "address" in endpoints, "{} endpoints missing 'address': {}".format(
            kind, endpoints)
        assert len(endpoints["address"]) == 1
        assert endpoints["address"][0].startswith(address_prefix)
        assert endpoints["address"][0].split(":")[-1] == "4044"
        assert "dns" in endpoints
        assert len(endpoints["dns"]) == 1
        assert endpoints["dns"][0] == sdk_hosts.autoip_host(service_name, task_name, 4044)

    endpoints_result = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, service_name, 'endpoints', json=True)
    assert len(endpoints_result) == 2, "Wrong number of endpoints got {} should be 2".format(
        len(endpoints_result))

    check_vip_endpoints('overlay-vip', "overlay", "9", "hello-overlay-vip-0-server")
    check_vip_endpoints('host-vip', "host", "10", "hello-host-vip-0-server")
def fn():
    """Return the PACKAGE_NAME tasks named `broker_name` that are in TASK_RUNNING_STATE.

    `broker_name` is not defined here; it is read from the surrounding scope.
    A DCOSHTTPException yields an empty list.
    """
    try:
        matching = []
        for task in shakedown.get_service_tasks(PACKAGE_NAME):
            if task['state'] == TASK_RUNNING_STATE and task['name'] == broker_name:
                matching.append(task)
        return matching
    except dcos.errors.DCOSHTTPException:
        return []
def _is_hdfs_ready(expected_tasks=DEFAULT_HDFS_TASK_COUNT):
    """Return True when at least `expected_tasks` tasks of HDFS_SERVICE_NAME are running."""
    running_count = 0
    for task in shakedown.get_service_tasks(HDFS_SERVICE_NAME):
        if task['state'] == 'TASK_RUNNING':
            running_count += 1
    return running_count >= expected_tasks
def fn():
    """Return the tasks of PACKAGE_NAME, or an empty list on a DCOSHTTPException."""
    try:
        service_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    except dcos.errors.DCOSHTTPException:
        service_tasks = []
    return service_tasks
def get_metrics(package_name, service_name, task_name):
    """Return a list of metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from

    Returns the 'datapoints' list from the app metrics of the task's container,
    or an empty list when 'pod info' does not list a task named task_name.

    Raises Exception when the task is not among the service's tasks, when the
    containers list cannot be fetched, or when the returned metrics carry a
    different executor ID than the task. Raises ValueError when the containers
    listing omits the task's container or the app metrics request fails.
    """
    # Start from None so that a missing task reaches the explicit check below
    # instead of failing on an unbound local variable.
    task_to_check = None
    for task in shakedown.get_service_tasks(service_name):
        if task['name'] == task_name:
            task_to_check = task

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    # The pod name is the first two dash-separated elements of the task name.
    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name,
                               "pod info {}".format(pod_name), json=True)
    task_info = next(
        (task for task in pod_info if task["info"]["name"] == task_name), None)
    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"][
        "value"]

    # Not related to functionality but consuming this
    # endpoint to verify downstream integrity
    containers_url = "{}/system/v1/agent/{}/metrics/v0/containers".format(
        shakedown.dcos_url(), agent_id)
    containers_response = sdk_cmd.request("GET", containers_url, retry=False)
    # NOTE(review): assumes a requests-style response whose `ok` is a boolean;
    # the previous `ok is None` comparison could never be true.
    if not containers_response.ok:
        log.info("Unable to fetch containers list")
        raise Exception(
            "Unable to fetch containers list: {}".format(containers_url))

    reported_container_ids = json.loads(containers_response.text)
    if task_container_id not in reported_container_ids:
        raise ValueError(
            "The metrics /container endpoint returned {}, expecting {} to be returned as well"
            .format(reported_container_ids, task_container_id))

    app_url = "{}/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
        shakedown.dcos_url(), agent_id, task_container_id)
    app_response = sdk_cmd.request("GET", app_url, retry=False)
    if not app_response.ok:
        raise ValueError("Failed to get metrics from container")

    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def fn():
    """Return the tasks shakedown reports for PACKAGE_NAME."""
    service_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    return service_tasks
def get_task_ids(service_name, task_prefix):
    """Return the IDs of the service's tasks whose name starts with `task_prefix`."""
    task_ids = []
    for task in shakedown.get_service_tasks(service_name):
        if task['name'].startswith(task_prefix):
            task_ids.append(task['id'])
    return task_ids
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec.

    Also checks that all EXPECTED_TASKS are present, that only "host" tasks
    hold port resources, that each task is on the expected network, and that
    the overlay-vip and host-vip endpoints expose one address on port 4044 and
    the expected DNS name.
    """
    service_name = config.SERVICE_NAME
    deployment_plan = sdk_plan.wait_for_completed_deployment(service_name)
    log.info("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct
    expected_phase_names = [
        'hello-overlay-deploy',
        'hello-overlay-vip-deploy',
        'hello-host-vip-deploy',
        'hello-host-deploy',
        'getter-deploy',
    ]
    phases = deployment_plan['phases']
    assert len(phases) == len(expected_phase_names)
    for phase, expected_name in zip(phases, expected_phase_names):
        assert phase['name'] == expected_name
    for phase in phases:
        assert len(phase['steps']) == 1

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(service_name, timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=60)
    except TimeoutExpired:
        # Deliberate best-effort: no recovery was observed within the timeout.
        pass

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = list(shakedown.get_service_tasks(service_name, completed=False))
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert expected_task in framework_task_names, "Missing {expected}".format(expected=expected_task)

    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:  # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources, "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources, "Task {} should NOT have port resources".format(name)

    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server", expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    def check_vip_endpoints(endpoint_name, kind, address_prefix, task_name):
        # One address with the expected prefix on port 4044, and one DNS entry
        # equal to the task's autoip host. `kind` labels the failure message so
        # that a host-vip failure is no longer reported as an overlay one.
        endpoints = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, service_name, 'endpoints {}'.format(endpoint_name), json=True)
        assert "address" in endpoints, "{} endpoints missing 'address': {}".format(kind, endpoints)
        assert len(endpoints["address"]) == 1
        assert endpoints["address"][0].startswith(address_prefix)
        assert endpoints["address"][0].split(":")[-1] == "4044"
        assert "dns" in endpoints
        assert len(endpoints["dns"]) == 1
        assert endpoints["dns"][0] == sdk_hosts.autoip_host(service_name, task_name, 4044)

    endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'endpoints', json=True)
    assert len(endpoints_result) == 2, "Wrong number of endpoints got {} should be 2".format(len(endpoints_result))

    check_vip_endpoints('overlay-vip', "overlay", "9", "hello-overlay-vip-0-server")
    check_vip_endpoints('host-vip', "host", "10", "hello-host-vip-0-server")
def is_service_ready(service_name, expected_tasks):
    """Return True when at least `expected_tasks` tasks of the service are running.

    Logs the expected and observed running-task counts on every call.
    """
    running_count = 0
    for task in shakedown.get_service_tasks(service_name):
        if task['state'] == 'TASK_RUNNING':
            running_count += 1

    LOGGER.info("Waiting for {n} tasks got {m} for service {s}".format(
        n=expected_tasks, m=running_count, s=service_name))
    return running_count >= expected_tasks