def test_pod_health_failed_check():
    """Deploys a pod with correct health checks, then partitions the network
    and verifies that the tasks get restarted with new task IDs.
    """
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    target_host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, target_host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    # Remember both containers' task IDs before the partition.
    old_tasks = common.get_pod_tasks(pod_id)
    old_task_ids = {old_tasks[0]['id'], old_tasks[1]['id']}

    pod = client.list_pod()[0]
    first_container = pod['instances'][0]['containers'][0]
    blocked_port = first_container['endpoints'][0]['allocatedHostPort']

    # Partition the health-check port so the checks fail and the pod restarts.
    common.block_iptable_rules_for_seconds(target_host, blocked_port, 7, block_input=True, block_output=False)
    deployment_wait(service_id=pod_id)

    for new_task in common.get_pod_tasks(pod_id):
        new_task_id = new_task['id']
        assert new_task_id not in old_task_ids, f"Task {new_task_id} has not been restarted"  # NOQA E999
def test_pod_with_container_network():
    """Tests creation of a pod with a "container" network, and its HTTP endpoint accessibility."""
    pod_def = pods.container_net_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    nginx_task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    net_info = common.running_status_network_info(nginx_task['statuses'])
    assert net_info['name'] == "dcos", \
        "The network name is {}, but 'dcos' was expected".format(net_info['name'])

    pod_ip = net_info['ip_addresses'][0]['ip_address']
    assert pod_ip is not None, "No IP address has been assigned to the pod's container"

    common.assert_http_code("http://{}:80/".format(pod_ip))
def test_pod_with_container_bridge_network():
    """Tests creation of a pod with a "container/bridge" network, and its HTTP endpoint accessibility."""
    pod_def = pods.container_bridge_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    nginx_task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    net_info = common.running_status_network_info(nginx_task['statuses'])
    assert net_info['name'] == "mesos-bridge", \
        "The network is {}, but mesos-bridge was expected".format(net_info['name'])

    # get the port on the host
    host_port = nginx_task['discovery']['ports']['ports'][0]['number']

    # the agent IP:port will be routed to the bridge IP:port
    # test against the agent_ip, however it is hard to get.. translating from
    # slave_id
    agent_ip = common.agent_hostname_by_id(nginx_task['slave_id'])
    assert agent_ip is not None, "Failed to get the agent IP address"

    bridge_ip = net_info['ip_addresses'][0]['ip_address']
    assert agent_ip != bridge_ip, "The container IP address is the same as the agent one"

    common.assert_http_code("http://{}:{}/".format(agent_ip, host_port))
async def test_event_channel_for_pods(sse_events):
    """Tests the Marathon event channel specific to pod events."""
    await common.assert_event('event_stream_attached', sse_events)

    pod_def = pods.simple_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write files.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    # Creation must emit both a pod-created and a deployment-success event.
    for expected_event in ('pod_created_event', 'deployment_step_success'):
        await common.assert_event(expected_event, sse_events)

    # Scaling the pod up should surface a pod-updated event.
    pod_def["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_def)
    deployment_wait(service_id=pod_id)

    await common.assert_event('pod_updated_event', sse_events)
def assert_mom_ee(version, security_mode='permissive'):
    """Deploys a MoM-EE app of the given version in the given security mode
    and waits until its service endpoint responds to /ping.
    """
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=(security_mode == 'strict'))
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if security_mode == 'strict':
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE using the fixture matching the requested mode and version.
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    assert os.path.isfile(app_def_file), \
        "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    logger.info('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    app_def['container']['docker']['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_def["id"])
    shakedown.dcos.service.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode), path="ping")
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    mom_ip = common.ip_of_mom()
    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)
        original_task_id = client.get_tasks(app_id)[0]['id']

        restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pod_with_persistent_volume():
    """Deploys a pod with a persistent volume and verifies that the data written
    by 'container1' (its task ID appended to a shared file) is served over HTTP
    by both containers of the pod.
    """
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    host = common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address']
    # Container with the name 'container1' appends its taskId to the file. So we search for the
    # taskId of that container which is not always the tasks[0]
    expected_data = next((t['id'] for t in tasks if t['name'] == 'container1'), None)
    assert expected_data, f"Hasn't found a container with the name 'container1' in the pod {tasks}"

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    # Typo fix: "Deployd" -> "Deployed".
    logger.info('Deployed two containers on {}:{}/{} and {}:{}/{}'.format(host, port1, path1, host, port2, path2))

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path, expected):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        # Bug fix: the format arguments were swapped, which produced the
        # misleading failure message "'<data>' was not found in '<expected>'".
        assert expected in data, "'{}' was not found in '{}'".format(expected, data)

    check_http_endpoint(port1, path1, expected_data)
    check_http_endpoint(port2, path2, expected_data)
def test_vip_mesos_cmd(marathon_service_name):
    """Validates the creation of an app with a VIP label and the accessibility
    of the service via the VIP.
    """
    app_def = apps.http_server()
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    vip_fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {"VIP_0": "/{}:10000".format(vip_name)},
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(vip_fqn, 10000))

    http_output_check()
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
    Verifies the task is preserved.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    # Block the ZooKeeper port (2181) for a while; the task must survive.
    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10,
                                           block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is
    asked to scale past the resources of that node, no tasks will be launched
    on any other node.
    """
    app_def = apps.sleep_app()
    app_id = app_def['id']

    pinned_host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(pinned_host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(pinned_host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    client.scale_app(app_id, 2)
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, \
        "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, \
        "The number of tasks is {}, but 1 was expected".format(len(tasks))
def test_task_gets_restarted_due_to_network_split():
    """Verifies that a health check fails in presence of a network partition,
    causing the task to be restarted with a new task ID while the app stays
    at one running, healthy task.
    """
    app_def = apps.http_server()
    app_id = app_def["id"]
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    tasks = client.get_tasks(app_id)
    task_id = tasks[0]['id']
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # introduce a network partition
    common.block_iptable_rules_for_seconds(host, port, sleep_seconds=10, block_input=True, block_output=False)

    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    tasks = client.get_tasks(app_id)
    new_task_id = tasks[0]['id']
    assert task_id != new_task_id, "The task didn't get killed because of a failed health check"

    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    # Bug fix: the message previously said "but 0 was expected" although the
    # assertion checks for exactly 1 healthy task.
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    # network partition should cause a task restart
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        tasks = client.get_tasks(app_id)
        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task has not been restarted: {}".format(task_id)

        app = client.get_app(app_id)
        assert app['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
        assert app['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    check_health_message()
def test_pod_with_persistent_volume():
    """Deploys a pod with a persistent volume and verifies that the expected
    content ('hello') is served over HTTP by both containers of the pod.
    """
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    host = common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address']
    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    # Typo fix: "Deployd" -> "Deployed".
    logger.info('Deployed two containers on {}:{}/{} and {}:{}/{}'.format(host, port1, path1, host, port2, path2))

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    check_http_endpoint(port1, path1)
    check_http_endpoint(port2, path2)
def test_pod_restarts_on_nonzero_exit_code():
    """Verifies that a pod get restarted in case one of its containers exits
    with a non-zero code. As a result, after restart, there should be two new
    tasks for different IDs.
    """
    pod_def = pods.simple_pod()
    pod_id = pod_def['id']
    pod_def["scaling"]["instances"] = 1
    pod_def['containers'][0]['exec']['command']['shell'] = 'sleep 5; echo -n leaving; exit 2'

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    initial_tasks = common.get_pod_tasks(pod_id)
    old_task_ids = {initial_tasks[0]['id'], initial_tasks[1]['id']}

    time.sleep(6)  # 1 sec past the 5 sec sleep in one of the container's command

    for task in common.get_pod_tasks(pod_id):
        assert task['id'] not in old_task_ids, "Got the same task ID"
def test_two_pods_with_shared_volume():
    """Confirms that 1 container can read data in a volume that was written
    from the other container. The reading container fails if it can't read
    the file. So if there are 2 tasks after 4 seconds we are good.
    """
    pod_def = pods.ephemeral_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    pod_tasks = common.get_pod_tasks(pod_id)
    assert len(pod_tasks) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(pod_tasks))

    time.sleep(4)

    pod_tasks = common.get_pod_tasks(pod_id)
    assert len(pod_tasks) == 2, \
        "The number of tasks is {} after sleeping, but 2 was expected".format(len(pod_tasks))
def test_create_and_update_pod():
    """Versions and reverting with pods"""
    pod_def = pods.simple_pod()
    pod_def["scaling"]["instances"] = 1
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    # Scaling up creates a second pod version.
    pod_def["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_def)
    deployment_wait(service_id=pod_id)

    versions = get_pod_versions(pod_id)
    assert len(versions) == 2, \
        "The number of versions is {}, but 2 was expected".format(len(versions))

    first_version = get_pod_version(pod_id, versions[0])
    second_version = get_pod_version(pod_id, versions[1])
    assert first_version["scaling"]["instances"] != second_version["scaling"]["instances"], \
        "Two pod versions have the same number of instances: {}, but they should not".format(
            first_version["scaling"]["instances"])
def test_create_pod_with_private_image():
    """Deploys a pod with a private Docker image, using Mesos containerizer.
    This method relies on the global `install_enterprise_cli` fixture to
    install the enterprise-cli-package.
    """
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value = json.dumps(common.create_docker_pull_config_json(username, password))

    pod_def = pods.private_docker_pod()
    pod_id = pod_def['id']
    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_def)
        deployment_wait(service_id=pod_id, max_attempts=300)
        assert client.show_pod(pod_id) is not None, "The pod has not been created"
    finally:
        # Always clean up the pull-config secret, even on failure.
        common.delete_secret(secret_name)
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))
    assert tasks[0]['host'] == pinned_host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], pinned_host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 10, \
        "The number of tasks is {} after scale, but 10 was expected".format(len(tasks))
    for task in tasks:
        assert task['host'] == pinned_host, \
            "The task is on {}, but it is supposed to be on {}".format(task['host'], pinned_host)
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    target_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, target_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    running = client.get_tasks(app_id)
    assert len(running) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(running))
    assert running[0]['host'] == target_host, \
        "The task is on {}, but it is supposed to be on {}".format(running[0]['host'], target_host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    running = client.get_tasks(app_id)
    assert len(running) == 10, "The number of tasks is {} after scale, but 10 was expected".format(len(running))
    for task in running:
        assert task['host'] == target_host, "The task is on {}, but it is supposed to be on {}".format(task['host'], target_host)
def test_private_repository_mesos_app():
    """Deploys an app with a private Docker image, using Mesos containerizer.
    It relies on the global `install_enterprise_cli` fixture to install the
    enterprise-cli-package.
    """
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value = json.dumps(common.create_docker_pull_config_json(username, password))

    app_def = apps.private_ucr_docker_app()
    app_id = app_def["id"]

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if is_strict():
        app_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_app(app_def)
        deployment_wait(service_id=app_id)
        common.assert_app_tasks_running(client, app_def)
    finally:
        # Always clean up the pull-config secret, even on failure.
        common.delete_secret(secret_name)
def test_launch_docker_grace_period(marathon_service_name):
    """Tests 'taskKillGracePeriodSeconds' option using a Docker container in a
    Marathon environment. Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`
    """
    app_id = '/launch-docker-grace-period-app'
    app_def = apps.docker_http_server(app_id)
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_grace_period = 3
    grace_period = 20
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_def['cmd'] = 'python test.py'
    task_name = app_id.lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    assert get_service_task(marathon_service_name, task_name) is not None

    client.scale_app(app_id, 0)
    assert get_service_task(marathon_service_name, task_name) is not None

    # tasks should still be here after the default_graceperiod
    time.sleep(default_grace_period + 1)
    assert get_service_task(marathon_service_name, task_name) is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    assert_that(lambda: get_service_task(marathon_service_name, task_name),
                eventually(equal_to(None), max_attempts=30))
def test_app_update():
    """Tests that an app gets successfully updated."""
    app_def = apps.mesos_app(app_id='/update-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    running_tasks = client.get_tasks(app_id)
    assert len(running_tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(running_tasks))

    # Bump resources and instance count, then redeploy.
    app_def['cpus'] = 1
    app_def['instances'] = 2

    client.update_app(app_id, app_def)
    deployment_wait(service_id=app_id)

    running_tasks = client.get_tasks(app_id)
    assert len(running_tasks) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(running_tasks))
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service
    endpoint eventually comes back and the task ID stays the same.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
def test_private_repository_mesos_app():
    """Deploys an app with a private Docker image, using Mesos containerizer.
    It relies on the global `install_enterprise_cli` fixture to install the
    enterprise-cli-package.
    """
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    pull_config = common.create_docker_pull_config_json(username, password)
    secret_value = json.dumps(pull_config)

    app_def = apps.private_ucr_docker_app()
    app_id = app_def["id"]

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if is_strict():
        app_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_app(app_def)
        deployment_wait(service_id=app_id)
        common.assert_app_tasks_running(client, app_def)
    finally:
        common.delete_secret(secret_name)
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service
    endpoint eventually comes back and the task ID stays the same.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    task_id_before = client.get_tasks(app_id)[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == task_id_before, \
            "Task {} has not recovered, it got replaced with another one: {}".format(task_id_before, tasks[0]['id'])

    check_task_recovery()
def test_create_pod_with_private_image():
    """Deploys a pod with a private Docker image, using Mesos containerizer.
    This method relies on the global `install_enterprise_cli` fixture to
    install the enterprise-cli-package.
    """
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    pull_config = common.create_docker_pull_config_json(username, password)
    secret_value = json.dumps(pull_config)

    pod_def = pods.private_docker_pod()
    pod_id = pod_def['id']
    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_def)
        deployment_wait(service_id=pod_id, max_attempts=300)
        pod = client.show_pod(pod_id)
        assert pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
def assert_mom_ee(version, security_mode='permissive'):
    """Deploys a MoM-EE app of the given version in the given security mode
    and waits until its service endpoint responds to /ping.
    """
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=(security_mode == 'strict'))
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if security_mode == 'strict':
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE from the fixture matching the requested mode and version.
    definition_path = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    assert os.path.isfile(definition_path), \
        "Couldn't find appropriate MoM-EE definition: {}".format(definition_path)

    image = mom_ee_image(version)
    logger.info('Deploying {} definition with {} image'.format(definition_path, image))

    app_def = get_resource(definition_path)
    app_def['container']['docker']['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_def["id"])
    shakedown.dcos.service.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode), path="ping")
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)
    old_tasks = client.get_tasks(app_id)

    # Kill the sleeping process and wait for the replacement deployment.
    common.kill_process_on_host(host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_id)
        assert old_tasks[0]['id'] != new_tasks[0]['id'], \
            "The task did not get killed: {}".format(old_tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)
    tasks_before = client.get_tasks(app_id)

    common.kill_process_on_host(pinned_host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        tasks_after = client.get_tasks(app_id)
        assert tasks_before[0]['id'] != tasks_after[0]['id'], \
            "The task did not get killed: {}".format(tasks_before[0]['id'])
        assert tasks_after[0]['host'] == pinned_host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(tasks_after[0]['host'], pinned_host)

    check_for_new_task()
def test_docker_dns_mapping(marathon_service_name):
    """Tests that a running Docker task is accessible via DNS."""
    app_def = apps.docker_http_server(app_id='/docker-dns-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    # A bogus DNS name must not resolve.
    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    status, output = run_command_on_master(bad_cmd)
    assert not status

    # The DNS name is a pure string computation, so build it outside the retry.
    dnsname = '{}.{}.mesos'.format(app_id.lstrip('/'), marathon_service_name)
    cmd = 'ping -c 1 {}'.format(dnsname)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_dns():
        wait_for_dns(dnsname)
        status, output = run_command_on_master(cmd)
        assert status, "ping failed for app using DNS lookup: {}".format(dnsname)

    check_dns()
def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)
        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill the MoM process and wait for the framework to come back.
        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        wait_for_task('marathon', 'marathon-user', 300)
        wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_marathon_when_task_agent_bounced():
    """Launch an app and restart the node the task is running on."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']
    restart_agent(pinned_host)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the VIP
    label and the accessibility of the service via the VIP.
    """
    app_def = apps.docker_http_server(app_id='vip-docker-bridge-mode-app')
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    vip_fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {"VIP_0": "/{}:10000".format(vip_name)},
        "protocol": "tcp",
        "name": "{}".format(vip_name),
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(vip_fqn, 10000))

    http_output_check()
def test_launch_and_scale_group():
    """Launches and scales a group."""
    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    deployment_wait(service_id=app1_id)

    # Renamed from `apps` to avoid shadowing the imported `apps` module.
    group_apps = client.get_group(groups_id)['apps']
    assert len(group_apps) == 2, "The number of apps is {}, but 2 was expected".format(len(group_apps))

    app1_tasks = client.get_tasks(app1_id)
    app2_tasks = client.get_tasks(app2_id)
    assert len(app1_tasks) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(app1_tasks))
    assert len(app2_tasks) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(app2_tasks))

    # scale by 2 for the entire group
    client.scale_group(groups_id, 2)
    deployment_wait(service_id=app1_id)

    app1_tasks = client.get_tasks(app1_id)
    app2_tasks = client.get_tasks(app2_id)
    assert len(app1_tasks) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(app1_tasks))
    assert len(app2_tasks) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(len(app2_tasks))
def test_scale_app_in_group():
    """Scales one app inside a two-app group and verifies that only that
    app's task count changes.
    """
    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    deployment_wait(service_id=app1_id)

    apps = client.get_group(groups_id)['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    # Both apps start with a single task.
    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scaling just one app in the group
    client.scale_app(app1_id, 2)
    deployment_wait(service_id=app1_id)

    # Only app #1 grows; app #2 is untouched.
    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after scale, but 1 was expected".format(len(tasks2))
def test_install_marathon(): """Install the Marathon package for DC/OS. """ # Install @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception) def install_marathon(): install_package_and_wait(PACKAGE_NAME) install_marathon() assert package_installed(PACKAGE_NAME), 'Package failed to install' # 5000ms = 5 seconds, 5 seconds * 60 attempts = 300 seconds = WAIT_TIME_IN_SECS @retrying.retry(wait_fixed=5000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception) def assert_service_registration(package, service): found = get_service(package) is not None assert found and service_healthy( service ), f"Service {package} did not register with DCOS" # NOQA E999 assert_service_registration(PACKAGE_NAME, SERVICE_NAME) deployment_wait(service_id=SERVICE_NAME) # Uninstall uninstall('marathon-user') deployment_wait(service_id=SERVICE_NAME) # Reinstall install_package_and_wait(PACKAGE_NAME) assert package_installed(PACKAGE_NAME), 'Package failed to reinstall'
def test_install_marathon():
    """Install the Marathon package for DC/OS.

    Exercises the full package lifecycle: install, verify healthy
    registration, uninstall, and reinstall.
    """

    # Install — retried because Cosmos may be briefly unavailable.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def install_marathon():
        install_package_and_wait(PACKAGE_NAME)

    install_marathon()
    assert package_installed(PACKAGE_NAME), 'Package failed to install'

    # 5000ms = 5 seconds, 5 seconds * 60 attempts = 300 seconds = WAIT_TIME_IN_SECS
    @retrying.retry(wait_fixed=5000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def assert_service_registration(package, service):
        # The service must both exist and report healthy to count as registered.
        found = get_service(package) is not None
        assert found and service_healthy(service), f"Service {package} did not register with DCOS"  # NOQA E999

    assert_service_registration(PACKAGE_NAME, SERVICE_NAME)
    deployment_wait(service_id=SERVICE_NAME)

    # Uninstall
    uninstall('marathon-user')
    deployment_wait(service_id=SERVICE_NAME)

    # Reinstall
    install_package_and_wait(PACKAGE_NAME)
    assert package_installed(PACKAGE_NAME), 'Package failed to reinstall'
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is
    asked to scale past the resources of that node, no tasks will be launched
    on any other node.
    """
    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(host))

    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    client.scale_app(app_id, 2)
    # Give the scale-up a chance to (incorrectly) place a second task.
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
def clear_pods():
    """Best-effort removal of all pods; failures are deliberately swallowed
    so that cleanup never breaks the calling test.
    """
    try:
        client = marathon.create_client()
        for pod in client.list_pod():
            client.remove_pod(pod["id"], True)
            deployment_wait(service_id=pod["id"])
    except Exception:
        pass
def test_install_universe_package(package):
    """Marathon is responsible for installing packages from the universe.
    This test confirms that the given package installs into a healthy state.
    """
    install_package_and_wait(package)
    assert package_installed(package), 'Package failed to install'

    deployment_wait(max_attempts=300)
    assert service_healthy(package)
def test_app_secret_env_var(secret_fixture):
    """Launches an app that reads a DC/OS secret through an environment
    variable, then fetches the written value over HTTP and compares it to
    the expected secret value.
    """
    secret_name, secret_value = secret_fixture

    app_id = '/app-secret-env-var-{}'.format(uuid.uuid4().hex)
    app_def = {
        "id": app_id,
        "instances": 1,
        "cpus": 0.5,
        "mem": 64,
        # Dump the secret into the sandbox, then serve the sandbox over HTTP.
        "cmd": "echo $SECRET_ENV >> $MESOS_SANDBOX/secret-env && /opt/mesosphere/bin/python -m http.server $PORT_API",
        "env": {
            "SECRET_ENV": {
                "secret": "secret1"
            }
        },
        "portDefinitions": [{
            "port": 0,
            "protocol": "tcp",
            "name": "api",
            "labels": {}
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(
        tasks) == 1, 'Failed to start the secret environment variable app'

    task = tasks[0]
    cmd = "curl {}:{}/secret-env".format(task['host'], task['ports'][0])

    # The HTTP server may need a moment to come up; retry the fetch.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def value_check():
        status, data = run_command_on_master(cmd)
        assert status, "{} did not succeed".format(cmd)
        assert data.rstrip() == secret_value

    value_check()
def test_app_file_based_secret(secret_fixture):
    """Launches an app with a file-based secret volume, then fetches the
    secret's content over HTTP and compares it to the expected value.
    """
    secret_name, secret_value = secret_fixture
    secret_container_path = 'mysecretpath'

    app_id = '/app-fbs-{}'.format(uuid.uuid4().hex)
    # In case you're wondering about the `cmd`: secrets are mounted via tmpfs inside
    # the container and are not visible outside, hence the intermediate file
    app_def = {
        "id": app_id,
        "instances": 1,
        "cpus": 0.5,
        "mem": 64,
        "cmd": "cat {} >> {}_file && /opt/mesosphere/bin/python -m http.server $PORT_API".format(
            secret_container_path, secret_container_path),
        "container": {
            "type": "MESOS",
            "volumes": [{
                "containerPath": secret_container_path,
                "secret": "secret1"
            }]
        },
        "portDefinitions": [{
            "port": 0,
            "protocol": "tcp",
            "name": "api",
            "labels": {}
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, 'Failed to start the file based secret app'

    task = tasks[0]
    # The secret by default is saved in $MESOS_SANDBOX/.secrets/path/to/secret
    cmd = "curl {}:{}/{}_file".format(task['host'], task['ports'][0], secret_container_path)

    # Retry until the in-container HTTP server is serving.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def value_check():
        status, data = run_command_on_master(cmd)
        assert status, "{} did not succeed. status = {}, data = {}".format(cmd, status, data)
        assert data.rstrip() == secret_value, "Got an unexpected secret data"

    value_check()
def uninstall(service, package=PACKAGE_NAME):
    """Best-effort uninstall of a Cosmos-installed service.

    Removes the package's app via Cosmos, waits for its deployment and
    service endpoint to disappear, and deletes the service's universe
    ZooKeeper node. Any failure is silently ignored (teardown helper).
    """
    try:
        task = get_service_task(package, service)
        if task is not None:
            cosmos_pm = packagemanager.PackageManager(cosmos.get_cosmos_url())
            cosmos_pm.uninstall_app(package, True, service)
            deployment_wait()
            # NOTE(review): the endpoint name is hardcoded to 'test-marathon'
            # instead of using the `service` parameter — presumably this helper
            # is only ever called with that service name; confirm against callers.
            assert common.wait_for_service_endpoint_removal('test-marathon')
            delete_zk_node('/universe/{}'.format(service))
    # Broad swallow is deliberate: cleanup must never fail the calling test.
    except Exception:
        pass
def test_deploy_custom_framework():
    """Launches an app that has necessary elements to create a service
    endpoint in DCOS, and confirms the root Marathon creates that endpoint.
    """
    app_def = apps.fake_framework()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id, max_attempts=300)

    timeout = timedelta(minutes=5).total_seconds()
    shakedown.dcos.service.wait_for_service_endpoint('pyfw', timeout)
def test_custom_service_name():
    """Install MoM with a custom service name and wait for its endpoint."""
    cosmos_pm = packagemanager.PackageManager(cosmos.get_cosmos_url())
    cosmos_pm.get_package_version('marathon', None)

    custom_name = "test-marathon"
    options = {
        'service': {'name': custom_name}
    }
    install_package('marathon', options_json=options)
    deployment_wait(service_id=custom_name, max_attempts=300)

    shakedown.dcos.service.wait_for_service_endpoint('test-marathon', timeout_sec=300, path="ping")
def test_create_pod():
    """Launch simple pod in DC/OS root marathon."""
    pod_def = pods.simple_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    created = client.show_pod(pod_id)
    assert created is not None, "The pod has not been created"
def simple_sleep_app(mom_endpoint):
    """Deploys a simple sleep app on the given MoM-EE instance and returns
    True if its tasks can be found afterwards.
    """
    # Deploy a simple sleep app in the MoM-EE
    with marathon_on_marathon(name=mom_endpoint) as client:
        app_def = apps.sleep_app()
        app_id = app_def["id"]

        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)

        tasks = get_service_task(mom_endpoint, app_id.lstrip("/"))
        logger.info('MoM-EE tasks: {}'.format(tasks))
        return tasks is not None
def test_app_with_persistent_volume_recovers():
    """Tests that when an app task with a persistent volume gets killed,
    it recovers on the node it was launched on, and it gets attached to
    the same persistent-volume."""
    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    task_id = tasks[0]['id']
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    # Poll the app's HTTP endpoint until the persisted data shows up.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        run, data = run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert target_data in data, "'{}' not found in {}".format(target_data, data)

    check_task(cmd, target_data='hello\n')

    # Kill the task's process on the host to force a recovery.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def kill_task(host, pattern):
        pids = common.kill_process_on_host(host, pattern)
        assert len(pids) != 0, "no task got killed on {} for pattern {}".format(host, pattern)

    kill_task(host, '[h]ttp\\.server')

    # Recovery is complete when exactly one task exists with a NEW task ID.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))
        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task ID has not changed, and is still {}".format(task_id)

    check_task_recovery()

    # The recovered task reused the volume, so the file now holds both writes.
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    check_task(cmd, target_data='hello\nhello\n')
def test_https_readiness_check_ready():
    """Tests HTTPS readiness check using a prepared nginx image that enables
    SSL (using self-signed certificate) and listens on 443.
    """
    app_def = apps.app_with_https_readiness_checks()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)

    # when readiness check keeps failing, the deployment will never finish
    deployment_wait(service_id=app_id, max_attempts=300)
def test_launch_app_on_public_agent():
    """Test the successful launch of a mesos container on public agent.
    MoMs by default do not have slave_public access.
    """
    app_def = common.add_role_constraint_to_app_def(apps.mesos_app(), ['slave_public'])
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    task_ip = client.get_tasks(app_id)[0]['host']
    assert task_ip in get_public_agents(), "The application task got started on a private agent"