def _kill(app_id, scale, host):
    """
    :param app_id: the id of the application
    :type app_id: str
    :param scale: scale the app down after killing the tasks
    :type scale: bool
    :param host: kill only those tasks running on the specified host
    :type host: str
    :returns: process return code
    :rtype: int
    """
    client = marathon.create_client()

    payload = client.kill_tasks(app_id, host=host, scale=scale)

    # If scale is provided, the API returns a "deploymentResult":
    # https://github.com/mesosphere/marathon/blob/50366c8/src/main/scala/mesosphere/marathon/api/RestResource.scala#L34-L36
    if scale:
        emitter.publish("Started deployment: {}".format(payload))
    else:
        if 'tasks' in payload:
            emitter.publish('Killed tasks: {}'.format(payload['tasks']))
            if len(payload['tasks']) == 0:
                return 1
        else:
            emitter.publish('Killed tasks: []')
            return 1
    return 0

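# A minimal sketch of how a docopt-style CLI handler might dispatch to _kill;
# the argument keys ('<app-id>', '--scale', '--host') are hypothetical and only
# illustrate the expected types (str, bool, str | None). This is not the
# project's actual dispatcher.
def _cmd_kill(args):
    return _kill(args['<app-id>'], args['--scale'], args['--host'])
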
def test_scale_app_in_group():
    """Tests the scaling of an individual app in a group."""

    client = marathon.create_client()
    try:
        client.remove_group('/test-group', True)
        shakedown.deployment_wait()
    except Exception:
        pass

    client.create_group(group())
    shakedown.deployment_wait()

    group_apps = client.get_group('/test-group/sleep')
    apps = group_apps['apps']
    assert len(apps) == 2

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 1
    assert len(tasks2) == 1

    # scaling just an app in the group
    client.scale_app('/test-group/sleep/goodnight', 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 2
    assert len(tasks2) == 1

def _deployment_list(app_id, json_):
    """
    :param app_id: the application id
    :type app_id: str
    :param json_: output json if True
    :type json_: bool
    :returns: process return code
    :rtype: int
    """
    client = marathon.create_client()

    deployments = client.get_deployments(app_id)

    if not deployments and not json_:
        msg = "There are no deployments"
        if app_id:
            msg += " for '{}'".format(app_id)
        raise DCOSException(msg)

    emitting.publish_table(emitter, deployments, tables.deployment_table, json_)
    return 0

def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service
    endpoint eventually comes back and the task ID stays the same.
    """

    app_def = apps.sleep_app()

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_def["id"])
        assert len(tasks) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()

def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
    Verifies the task is preserved.
    """

    app_def = apps.sleep_app()

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()

def test_app_update_rollback():
    """Tests that an updated app can be rolled back to its initial version."""

    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after update, but 2 was expected".format(len(tasks))

    # provides a testing delay so the rollback can be issued mid-deployment
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    shakedown.deployment_wait()

    # the update to 1 instance was rolled back, so there should still be 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after rollback, but 2 was expected".format(len(tasks))

def test_unhealthy_app_can_be_rolled_back():
    """Verifies that an updated app gets rolled back due to being unhealthy."""

    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    try:
        shakedown.deployment_wait()
    except Exception:
        client.rollback_deployment(deployment_id)
        shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after rollback, but 1 was expected".format(len(tasks))

def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests that when a task lands on a pinned node (and barely fits) and the
    app is asked to scale past the resources of that node, no tasks are
    launched on any other node.
    """

    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    print('Constraint set to host: {}'.format(host))
    # the app's cpus is set to more than half of the agent's cores
    # such that only 1 task can land on the node
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)
    client.scale_app(app_id, 2)

    time.sleep(5)
    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))

def test_launch_mesos_grace_period(marathon_service_name):
    """Tests 'taskKillGracePeriodSeconds' option using a Mesos container in a
    Marathon environment. Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`
    """

    app_def = apps.mesos_app()

    default_grace_period = 3
    grace_period = 20

    app_def['fetch'] = [{"uri": "https://downloads.mesosphere.com/testing/test.py"}]
    app_def['cmd'] = '/opt/mesosphere/bin/python test.py'
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_id = app_def['id'].lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # tasks should still be here after the default_grace_period
    time.sleep(default_grace_period + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def test_launch_and_scale_group():
    """Launches and scales a group."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    shakedown.deployment_wait()

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scale by 2 for the entire group
    client.scale_group(groups_id, 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(len(tasks2))

def test_scale_app_in_group():
    """Scales an individual app in a group."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    shakedown.deployment_wait()

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scaling just one app in the group
    client.scale_app(app1_id, 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after scale, but 1 was expected".format(len(tasks2))

def test_vip_mesos_cmd(marathon_service_name):
    """Validates the creation of an app with a VIP label and the
    accessibility of the service via the VIP.
    """

    app_def = apps.http_server()
    vip_name = app_def["id"].lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the
    VIP label and the accessibility of the service via the VIP.
    """

    app_def = apps.docker_http_server()

    vip_name = app_def["id"].lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_launch_docker_grace_period(marathon_service_name):
    """Tests 'taskKillGracePeriodSeconds' option using a Docker container in a
    Marathon environment. Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`
    """

    app_def = apps.docker_http_server()
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_grace_period = 3
    grace_period = 20
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_def['cmd'] = 'python test.py'
    app_id = app_def['id'].lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # tasks should still be here after the default_grace_period
    time.sleep(default_grace_period + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def test_event_channel():
    """Tests the event channel. The way events are verified is by streaming
    the events to a test.txt file. The fixture ensures the file is removed
    before and after the test. The events checked are connecting, deploying
    a good task and killing a task.
    """

    app_def = common.app_mesos()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_deployment_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'event_stream_attached' in stdout
        assert 'deployment_info' in stdout
        assert 'deployment_step_success' in stdout

    check_deployment_message()

    client.remove_app(app_id, True)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_kill_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'Killed' in stdout

    check_kill_message()

def test_launch_mesos_root_marathon_graceperiod():
    """Tests the 'taskKillGracePeriodSeconds' of a task launched from the root
    marathon. The default is 3 seconds. This tests setting that period to a
    value other than the default.
    """

    app_def = app_mesos()
    app_def['id'] = 'grace'
    default_graceperiod = 3
    graceperiod = 20
    app_def['taskKillGracePeriodSeconds'] = graceperiod

    fetch = [{
        "uri": "https://downloads.mesosphere.com/testing/test.py"
    }]
    app_def['fetch'] = fetch
    app_def['cmd'] = '/opt/mesosphere/bin/python test.py'

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait()

    tasks = get_service_task('marathon', 'grace')
    assert tasks is not None

    client.scale_app('/grace', 0)
    tasks = get_service_task('marathon', 'grace')
    assert tasks is not None

    # task should still be here after the default_graceperiod
    time.sleep(default_graceperiod + 1)
    tasks = get_service_task('marathon', 'grace')
    assert tasks is not None

    # but not after the set graceperiod
    time.sleep(graceperiod)
    tasks = get_service_task('marathon', 'grace')
    assert tasks is None

def _test_declined_offer(app_def, reason):
    """Used to confirm that offers were declined. The `processedOffersSummary`
    and these tests in general require Marathon 1.4+ with the queue endpoint.
    The retry is the best possible way to "time" the success of the test.
    """

    app_id = app_def["id"]
    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def verify_declined_offer():
        deployments = client.get_deployments(app_id)
        assert len(deployments) == 1

        offer_summary = client.get_queued_app(app_id)['processedOffersSummary']
        role_summary = declined_offer_by_reason(offer_summary['rejectSummaryLastOffers'], reason)
        last_attempt = declined_offer_by_reason(offer_summary['rejectSummaryLaunchAttempt'], reason)

        assert role_summary['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert role_summary['processed'] > 0, "There are no processed offers for {}".format(reason)
        assert last_attempt['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert last_attempt['processed'] > 0, "There are no processed offers for {}".format(reason)

    verify_declined_offer()

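# A minimal sketch of the declined_offer_by_reason helper assumed above: it
# scans a reject summary (assumed here to be a list of dicts with 'reason',
# 'declined' and 'processed' keys, following the Marathon /v2/queue payload)
# for the entry matching the given reason. This is an illustrative
# reconstruction, not necessarily the project's actual helper.
def declined_offer_by_reason_sketch(offers, reason):
    # return the summary entry whose 'reason' matches, or None if absent
    for offer in offers:
        if offer['reason'] == reason:
            return offer
    return None
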
def test_launch_container_with_persistent_volume():
    """Tests launching a task with a persistent volume (PV). It will write to
    a file in the PV. The app is killed and restarted, and we can still read
    from the PV.
    """

    app_def = persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    client.restart_app(app_id)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\nhello\n', "'{}' was not equal to hello\\nhello\\n".format(data)

def test_docker_dns_mapping(marathon_service_name):
    """Tests that a running docker task is accessible from DNS."""

    app_id = uuid.uuid4().hex
    client = marathon.create_client()
    app_json = app_docker(app_id)
    client.add_app(app_json)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    status, output = shakedown.run_command_on_master(bad_cmd)
    assert not status

    @retrying.retry(stop_max_attempt_number=30)
    def check_dns():
        cmd = 'ping -c 1 {}.{}.mesos'.format(app_id, marathon_service_name)
        shakedown.wait_for_dns('{}.{}.mesos'.format(app_id, marathon_service_name))
        status, output = shakedown.run_command_on_master(cmd)
        assert status

    check_dns()

def _log_marathon(follow, lines, ssh_config_file):
    """Prints the contents of the marathon logs.

    :param follow: same as unix tail's -f
    :type follow: bool
    :param lines: number of lines to print
    :type lines: int
    :param ssh_config_file: SSH config file
    :type ssh_config_file: str | None
    :returns: process return code
    :rtype: int
    """

    ssh_options = util.get_ssh_options(ssh_config_file, [])

    journalctl_args = ""
    if follow:
        journalctl_args += "-f "
    if lines:
        journalctl_args += "-n {} ".format(lines)

    leader_ip = marathon.create_client().get_leader().split(":")[0]

    cmd = ("ssh {0}core@{1} " +
           "journalctl {2}-u marathon").format(ssh_options, leader_ip, journalctl_args)

    emitter.publish(DefaultError("Running `{}`".format(cmd)))
    return subprocess.call(cmd, shell=True)

def _group_add(group_resource):
    """
    :param group_resource: optional filename for the group resource
    :type group_resource: str
    :returns: process return code
    :rtype: int
    """
    group_resource = _get_resource(group_resource)
    client = marathon.create_client()

    # Check that the group doesn't exist
    group_id = client.normalize_app_id(group_resource['id'])
    try:
        client.get_group(group_id)
    except DCOSException as e:
        logger.exception(e)
    else:
        raise DCOSException("Group '{}' already exists".format(group_id))

    client.create_group(group_resource)
    return 0

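# A minimal sketch of the _get_resource helper used by _group_add and _add,
# assuming it parses a JSON resource from a file path, or from stdin when no
# path is given. The real CLI helper may also handle URLs and validation, so
# treat this as illustrative only.
import json
import sys


def _get_resource_sketch(resource):
    if resource is None:
        return json.load(sys.stdin)  # no filename given: read from stdin
    with open(resource) as resource_file:
        return json.load(resource_file)
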
def _add(app_resource):
    """
    :param app_resource: optional filename for the application resource
    :type app_resource: str
    :returns: process return code
    :rtype: int
    """
    application_resource = _get_resource(app_resource)

    # Add application to marathon
    client = marathon.create_client()

    # Check that the application doesn't exist
    app_id = client.normalize_app_id(application_resource['id'])
    try:
        client.get_app(app_id)
    except DCOSException as e:
        logger.exception(e)
    else:
        raise DCOSException("Application '{}' already exists".format(app_id))

    client.add_app(application_resource)
    return 0

def _stop(app_id, force):
    """Stop a Marathon application.

    :param app_id: the id of the application
    :type app_id: str
    :param force: whether to override running deployments
    :type force: bool
    :returns: process return code
    :rtype: int
    """
    # Check that the application exists
    client = marathon.create_client()

    desc = client.get_app(app_id)

    if desc['instances'] <= 0:
        emitter.publish(
            'Application {!r} already stopped: {!r} instances.'.format(
                app_id, desc['instances']))
        return 1

    app_json = {'instances': 0}

    deployment = client.update_app(app_id, app_json, force)
    emitter.publish('Created deployment {}'.format(deployment))
    return 0

def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the
    VIP label and the accessibility of the service via the VIP.
    """

    vip_name = 'vip-docker-service'
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def = app_docker()
    app_def['container']['docker']['portMappings'] = [
        {
            "containerPort": 8080,
            "hostPort": 0,
            "labels": {
                "VIP_0": "/{}:10000".format(vip_name)
            },
            "protocol": "tcp",
            "name": "{}".format(vip_name)
        }
    ]
    app_def['id'] = vip_name

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30)
    def http_output_check():
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_update_app_rollback():
    """Tests updating an app, then rolling back the update."""

    app_id = uuid.uuid4().hex
    app_def = readiness_and_health_app()
    app_def['id'] = app_id

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    # start with 1
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    shakedown.deployment_wait()

    # update works to 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2

    # provides a testing delay to rollback from
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)

    shakedown.deployment_wait()

    # the update to 1 instance was rolled back, so there should still be 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2

def test_vip_mesos_cmd(marathon_service_name):
    """Tests the creation of a VIP from a python command NOT in a docker
    image. The test validates the creation of an app with the VIP label and
    the accessibility of the service via the VIP.
    """

    vip_name = 'vip-service'
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def = python_http_app()
    app_def['portDefinitions'] = [
        {
            "port": 0,
            "protocol": "tcp",
            "name": "{}".format(vip_name),
            "labels": {
                "VIP_0": "/{}:10000".format(vip_name)
            }
        }
    ]
    app_def['id'] = vip_name

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30)
    def http_output_check():
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_update_app_poor_health():
    """Tests updating an app with an automatic rollback due to poor health."""

    app_id = uuid.uuid4().hex
    app_def = readiness_and_health_app()
    app_def['id'] = app_id

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    # start with 1
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    # provides a testing delay to rollback from
    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    # 2 min wait
    try:
        shakedown.deployment_wait()
    except Exception:
        client.rollback_deployment(deployment_id)
        shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

def test_launch_docker_graceperiod(marathon_service_name):
    """Tests the 'taskKillGracePeriodSeconds' in a Marathon environment.
    This is the same test as above; however, it runs against Docker.
    """

    app_id = uuid.uuid4().hex
    app_def = app_docker(app_id)
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_graceperiod = 3
    graceperiod = 20
    app_def['taskKillGracePeriodSeconds'] = graceperiod
    app_def['cmd'] = 'python test.py'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # task should still be here after the default_graceperiod
    time.sleep(default_graceperiod + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set graceperiod
    time.sleep(graceperiod)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def _restart(app_id, force):
    """
    :param app_id: the id of the application
    :type app_id: str
    :param force: whether to override running deployments
    :type force: bool
    :returns: process return code
    :rtype: int
    """
    client = marathon.create_client()

    desc = client.get_app(app_id)

    if desc['instances'] <= 0:
        app_id = client.normalize_app_id(app_id)
        emitter.publish(
            'Unable to perform rolling restart of application {!r} '
            'because it has no running tasks'.format(app_id))
        return 1

    payload = client.restart_app(app_id, force)

    emitter.publish('Created deployment {}'.format(payload['deploymentId']))
    return 0

def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from the
    MoM. Verifies the task is still good.
    """

    app_def = app('zk-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/zk-failure')
    original_task_id = tasks[0]['id']

    with shakedown.iptable_rules(host):
        block_port(host, 2181)
        # time of the zk block
        time.sleep(10)

    # after access to zk is restored.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/zk-failure')
        assert tasks[0]['id'] == original_task_id

    check_task_is_back()

def test_private_repository_docker_app():
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    agents = shakedown.get_private_agents()
    common.create_docker_credentials_file(username, password)
    common.copy_docker_credentials_file(agents)

    app_def = apps.private_docker_app()
    app_id = app_def["id"]

    if shakedown.ee_version() == 'strict':
        app_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    common.assert_app_tasks_running(client, app_def)

def test_pin_pod():
    """Tests that we can pin a pod to a host."""

    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id
    host = ip_other_than_mom()
    pin_pod_to_host(pod_json, host)
    client.add_pod(pod_json)
    deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2

    pod = client.list_pod()[0]
    assert pod['instances'][0]['agentHostname'] == host

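# A minimal sketch of the _pods_json fixture loader used by these pod tests,
# assuming it reads a pod definition from a JSON file in a local fixtures
# directory; the directory layout and the default file name used here are
# assumptions, not the project's actual fixture path.
import json
import os


def _pods_json_sketch(file_name='simple-pods.json'):
    path = os.path.join(os.path.dirname(__file__), 'fixtures', file_name)
    with open(path) as fixture_file:
        return json.load(fixture_file)
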
def test_pod_port_communication():
    """Tests that one container can establish a socket connection to the
    other container in the same pod.
    """

    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id

    # Sleeps 2 seconds, then container 2 checks communication with container 1.
    # If that times out, the task completes, resulting in 1 running container;
    # otherwise 2 containers are expected to be running.
    pod_json['containers'][1]['exec']['command']['shell'] = \
        'sleep 2; curl -m 2 localhost:$ENDPOINT_HTTPENDPOINT; if [ $? -eq 7 ]; then exit; fi; /opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTPENDPOINT2'  # NOQA

    client.add_pod(pod_json)
    deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2

def test_run_app_with_specified_user():
    """Runs an app as a given user (centos). CentOS is expected, since it has
    the centos user by default.
    """

    app_def = apps.sleep_app()
    app_def['user'] = '******'
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = client.get_tasks(app_id)
    task = tasks[0]
    assert task['state'] == 'TASK_RUNNING', "The task is not running: {}".format(task['state'])

    app = client.get_app(app_def["id"])
    assert app['user'] == 'centos', "The app's user is not centos: {}".format(app['user'])

def test_health_check_unhealthy():
    """Tests failed health checks of an app launched by marathon.
    The health check used here never passes.
    """

    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check('/bad-url', failures=0, timeout=0))
    app_def['id'] = 'unhealthy'
    app_def['healthChecks'] = health_list

    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        app = client.get_app('/unhealthy')
        assert app['tasksRunning'] == 1 and app['tasksHealthy'] == 0 and app['tasksUnhealthy'] == 1

    check_failure_message()

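# A minimal sketch of the health_check helper used above, assuming it builds a
# standard Marathon HTTP health-check dict; the defaults chosen here (protocol,
# interval, grace period) are assumptions inferred from the call sites, not
# the project's actual helper.
def health_check_sketch(path='/', failures=1, timeout=2):
    return {
        'protocol': 'HTTP',
        'path': path,
        'timeoutSeconds': timeout,
        'intervalSeconds': 2,
        'maxConsecutiveFailures': failures,
        'gracePeriodSeconds': 10
    }
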
def test_run_app_with_non_existing_user():
    """Runs an app as a non-existing user, which should fail."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message, "Launched an app with a non-existing user: {}".format(app['user'])

    check_failure_message()

def test_update_app():
    """Tests updating an app."""

    app_id = uuid.uuid4().hex
    app_def = app_mesos(app_id)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1

        app_def['cpus'] = 1
        app_def['instances'] = 2
        client.update_app(app_id, app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks(app_id)
        assert len(tasks) == 2

def test_bad_uri():
    """Tests marathon's response to launching a task with a bad url
    (a url that isn't fetchable).
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    fetch = [{"uri": "http://mesosphere.io/missing-artifact"}]
    app_def['fetch'] = fetch

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        appl = client.get_app(app_id)
        message = appl['lastTaskFailure']['message']
        error = "Failed to fetch all URIs for container"
        assert error in message

    check_failure_message()

def _show(app_id, version):
    """Show details of a Marathon application.

    :param app_id: The id for the application
    :type app_id: str
    :param version: The version, either absolute (date-time) or relative
    :type version: str
    :returns: process return code
    :rtype: int
    """
    client = marathon.create_client()

    if version is not None:
        version = _calculate_version(client, app_id, version)

    app = client.get_app(app_id, version=version)
    emitter.publish(app)
    return 0

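# A minimal sketch of the _calculate_version helper referenced above, assuming
# a relative version is a negative integer string (e.g. '-1' for the version
# before the current one) indexing into the app's version list (newest first),
# while absolute date-time versions pass through unchanged. The method name
# get_app_versions and the exact semantics are assumptions; illustrative only.
def _calculate_version_sketch(client, app_id, version):
    try:
        offset = int(version)
    except ValueError:
        return version  # already an absolute date-time version
    if offset >= 0:
        raise DCOSException('Relative versions must be negative: {}'.format(version))
    versions = client.get_app_versions(app_id)  # assumed newest-first
    if -offset > len(versions) - 1:
        raise DCOSException(
            'Application {!r} only has {} versions'.format(app_id, len(versions)))
    return versions[-offset]
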
def test_task_failure_recovers():
    """Tests that if a task is KILLED, it will be relaunched and the taskID
    is different.
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    check_new_task_id()

def test_two_pods_with_shared_volume():
    """Confirms that 1 container can read data in a volume that was written
    by the other container. The reading container fails if it can't read the
    file. So if there are 2 tasks after 4 seconds, we are good.
    """

    pod_def = pods.ephemeral_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks))

    time.sleep(4)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of tasks is {} after sleeping, but 2 was expected".format(len(tasks))

def test_scaleup_pods():
    """Scaling up a pod from 1 to 10"""

    _clear_pods()
    client = marathon.create_client()

    pod_id = "/pod-scaleup"

    pod_json = _pods_json()
    pod_json["id"] = pod_id
    pod_json["scaling"]["instances"] = 1
    client.add_pod(pod_json)
    deployment_wait()

    status = _pod_status(client, pod_id)
    assert len(status["instances"]) == 1

    pod_json["scaling"]["instances"] = 10
    client.update_pod(pod_id, pod_json)
    deployment_wait()

    status = _pod_status(client, pod_id)
    assert len(status["instances"]) == 10

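# A minimal sketch of the _pod_status helper used by the scaling tests,
# assuming it fetches the pod's status document (with its 'instances' list)
# through the client; the method name show_pod is an assumption about the
# client API, so treat this as illustrative only.
def _pod_status_sketch(client, pod_id):
    return client.show_pod(pod_id)
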
def test_default_user():
    """Ensures a task is started as root by default."""

    app_def = apps.sleep_app()

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_def["id"])
    host = tasks[0]['host']

    success = shakedown.run_command_on_agent(
        host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert success, "The app is running as non-root"

def test_rollback_before_ready():
    """Tests the rollback of an app that didn't complete readiness."""

    client = marathon.create_client()
    fw = fake_framework_app()

    # testing 30 sec interval
    readiness_time = 30
    fw['readinessChecks'][0]['intervalSeconds'] = readiness_time
    deployment_id = client.add_app(fw)

    # 2 secs later it is still deploying
    time.sleep(2)
    deployment = client.get_deployment(deployment_id)
    assert deployment['currentActions'][0]['readinessCheckResults'][0]['ready'] is False

    client.rollback_deployment(deployment_id)
    # normally deployment would take another 28 secs
    assert client.get_deployment(deployment_id) is None

def test_scaledown_pods():
    """Scaling down a pod from 10 to 1"""

    client = marathon.create_client()

    pod_id = "/pod-scaledown"

    pod_json = _pods_json()
    pod_json["id"] = pod_id
    pod_json["scaling"]["instances"] = 10
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    status = _pod_status(client, pod_id)
    assert len(status["instances"]) == 10

    pod_json["scaling"]["instances"] = 1
    client.update_pod(pod_id, pod_json)
    shakedown.deployment_wait()

    status = _pod_status(client, pod_id)
    assert len(status["instances"]) == 1

def test_event_channel():
    delete_all_apps_wait()
    app_def = app_mesos()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait()

    status, stdout = run_command_on_master('cat test.txt')
    assert 'event_stream_attached' in stdout
    assert 'deployment_info' in stdout
    assert 'deployment_step_success' in stdout

    client.remove_app(app_id)
    deployment_wait()

    status, stdout = run_command_on_master('cat test.txt')
    assert 'Killed' in stdout

def test_marathon_when_task_agent_bounced():
    """Launches an app and restarts the node the task is on."""

    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id

    check_task_is_back()

def test_pod_comm_via_volume():
    """Confirms that 1 container can read data from a volume that was written
    by the other container. Most of the test is in the `vol-pods.json`.
    The reading container will die if it can't read the file. So if there
    are 2 tasks after 4 secs, we are good.
    """

    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    # the pod is set up to have c1 write and c2 read after 2 secs;
    # there are 2 tasks, unless the file doesn't exist, then there is 1
    pod_json = _pods_json('vol-pods.json')
    pod_json["id"] = pod_id
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2

    time.sleep(4)

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2

def test_pod_container_network():
    """Tests a pod on the "container" network (using the default network "dcos")."""

    client = marathon.create_client()

    pod_id = "/pod-container-net-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-container-net.json')
    pod_json["id"] = pod_id
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    network_info = tasks[0]['statuses'][0]['container_status']['network_infos'][0]
    assert network_info['name'] == "dcos"

    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert container_ip is not None

    url = "http://{}:80/".format(container_ip)
    common.assert_http_code(url)

def test_network_pinger(test_type, get_pinger_app, dns_format, marathon_service_name):
    """This test runs a pinger app and a relay app. It retrieves the python
    app from the master via the new http service (which will be moving into
    shakedown). Then a curl call to the relay will invoke a call to the 2nd
    pinger app and return pong to the relay, then back to curl.

    It tests that 1 task can communicate over the network with another task
    on the given network. It tests inbound and outbound connectivity.

    The test_type param is not used. It is passed so that it is clear which
    parametrized test is running or may be failing.
    """

    client = marathon.create_client()
    pinger_app = get_pinger_app('pinger')
    relay_app = get_pinger_app('relay')
    pinger_dns = dns_format.format('pinger', marathon_service_name)
    relay_dns = dns_format.format('relay', marathon_service_name)

    # test pinger app to master
    shakedown.copy_file_to_master(fixture_dir() + "/pinger.py")

    with shakedown.master_http_service():
        # need to add the app with the http service in place or it will fail to fetch
        client.add_app(pinger_app)
        client.add_app(relay_app)
        shakedown.deployment_wait()
        shakedown.wait_for_dns(relay_dns)

    relay_url = 'http://{}:7777/relay-ping?url={}:7777'.format(relay_dns, pinger_dns)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30)
    def http_output_check():
        status, output = shakedown.run_command_on_master('curl {}'.format(relay_url))
        assert status
        assert 'Pong /pinger' in output
        assert 'Relay from /relay' in output

    http_output_check()

def test_health_failed_check():
    """Tests a health check of an app launched by marathon.
    The health check succeeded, then failed due to a network partition.
    """

    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check())
    app_def['id'] = 'healthy'
    app_def['healthChecks'] = health_list

    pin_to_host(app_def, ip_other_than_mom())

    client.add_app(app_def)
    shakedown.deployment_wait()

    # healthy
    app = client.get_app('/healthy')
    assert app['tasksRunning'] == 1
    assert app['tasksHealthy'] == 1

    tasks = client.get_tasks('/healthy')
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # prefer to break at the agent (having issues)
    mom_ip = ip_of_mom()

    shakedown.save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)
    shakedown.deployment_wait()

    # after the network failure is restored, the task returns with a new task ID
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_health_message():
        new_tasks = client.get_tasks('/healthy')
        assert new_tasks[0]['id'] != tasks[0]['id']
        app = client.get_app('/healthy')
        assert app['tasksRunning'] == 1
        assert app['tasksHealthy'] == 1

    check_health_message()

def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the
    VIP label and the accessibility of the service via the VIP.
    """

    app_def = apps.docker_http_server(app_id='vip-docker-bridge-mode-app')
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_launch_container_with_persistent_volume():
    """Tests launching a task with a persistent volume (PV). It will write to
    a file in the PV. The app is killed and restarted, and we can still read
    from the PV.
    """

    app_def = persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    client.restart_app(app_id)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000, retry_on_exception=ignore_on_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1

    check_task_recovery()

    # re-fetch the restarted task before reading the volume again
    tasks = client.get_tasks(app_id)
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\nhello\n', "'{}' was not equal to hello\\nhello\\n".format(data)

def test_restart_container_with_persistent_volume():
    """A task with a persistent volume, which writes to a file in the
    persistent volume, is launched. The app is killed and restarted, and we
    can still read from the persistent volume what was written to it.
    """

    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    client.restart_app(app_id)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))

    check_task_recovery()

    # re-fetch the restarted task before reading the volume again
    tasks = client.get_tasks(app_id)
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\nhello\n', "'{}' was not equal to hello\\nhello\\n".format(data)

def _list(json_, endpoints, app_id, package_name):
    """List installed apps.

    :param json_: output json if True
    :type json_: bool
    :param endpoints: Whether to include a list of endpoints as port-host pairs
    :type endpoints: boolean
    :param app_id: App ID of app to show
    :type app_id: str
    :param package_name: The package to show
    :type package_name: str
    :returns: process return code
    :rtype: int
    """
    config = util.get_config()
    init_client = marathon.create_client(config)
    installed = package.installed_packages(init_client, endpoints)

    # only emit those packages that match the provided package_name and app_id
    results = []
    for pkg in installed:
        pkg_info = pkg.dict()
        if (_matches_package_name(package_name, pkg_info) and
                _matches_app_id(app_id, pkg_info)):
            if app_id:
                # if the user is asking a specific id then only show that id
                pkg_info['apps'] = [
                    app for app in pkg_info['apps']
                    if app == app_id
                ]
            results.append(pkg_info)

    if results or json_:
        emitting.publish_table(emitter, results, tables.package_table, json_)
    else:
        msg = ("There are currently no installed packages. "
               "Please use `dcos package install` to install a package.")
        raise DCOSException(msg)

    return 0

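# Minimal sketches of the matcher helpers used above, assuming a package
# matches when the filter is unset or equal to the corresponding field; the
# field names ('name', 'apps') follow the pkg_info dict used in _list, but
# the real helpers may differ.
def _matches_package_name_sketch(package_name, pkg_info):
    return package_name is None or pkg_info['name'] == package_name


def _matches_app_id_sketch(app_id, pkg_info):
    return app_id is None or app_id in pkg_info.get('apps', [])
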
def uninstall(package_name, remove_all, app_id, cli, app):
    """Uninstalls a package.

    :param package_name: The package to uninstall
    :type package_name: str
    :param remove_all: Whether to remove all instances of the named app
    :type remove_all: boolean
    :param app_id: App ID of the app instance to uninstall
    :type app_id: str
    :param cli: Whether to uninstall the CLI subcommand
    :type cli: bool
    :param app: Whether to uninstall the app
    :type app: bool
    :rtype: None
    """
    # If neither flag is given, uninstall both the CLI subcommand and the app
    if cli is False and app is False:
        cli = app = True

    uninstalled = False
    if cli:
        if subcommand.uninstall(package_name):
            uninstalled = True

    if app:
        num_apps = uninstall_app(
            package_name,
            remove_all,
            app_id,
            marathon.create_client(),
            mesos.DCOSClient())

        if num_apps > 0:
            uninstalled = True

    if uninstalled:
        return None
    else:
        msg = 'Package [{}]'.format(package_name)
        if app_id is not None:
            msg += " with id [{}]".format(app_id)
        msg += " is not installed."
        raise DCOSException(msg)

def test_pod_multi_port():
    """Tests that 2 containers with a port each are provisioned with their
    own unique port assignment.
    """

    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    pod = client.list_pod()[0]

    container1 = pod['instances'][0]['containers'][0]
    port1 = container1['endpoints'][0]['allocatedHostPort']
    container2 = pod['instances'][0]['containers'][1]
    port2 = container2['endpoints'][0]['allocatedHostPort']
    assert port1 != port2

def _list(json_):
    """
    :param json_: output json if True
    :type json_: bool
    :returns: process return code
    :rtype: int
    """
    client = marathon.create_client()
    apps = client.get_apps()

    if json_:
        emitter.publish(apps)
    else:
        deployments = client.get_deployments()
        table = tables.app_table(apps, deployments)
        output = six.text_type(table)
        if output:
            emitter.publish(output)

    return 0

def test_pinned_task_recovers_on_host():
    """Tests that a killed pinned task will recover on the pinned node."""

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        check_for_new_task()

def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    common.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    assert_that(lambda: client.get_tasks(app_def["id"])[0],
                eventually(has_value('id', not_(equal_to(old_task_id))), max_attempts=30))
