def test_marathon_when_disconnected_from_zk():
    """ Launch an app from Marathon. Then knock out access to zk from the MoM.
        Verify the task is still good.
    """
    app_def = app('zk-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/zk-failure')
    original_task_id = tasks[0]['id']

    with shakedown.iptable_rules(host):
        block_port(host, 2181)
        # time of the zk block
        time.sleep(10)

    # after access to zk is restored.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/zk-failure')
        # BUG FIX: the bare comparison was a no-op (its result was discarded),
        # so the retry loop never verified anything. Assert so that a changed
        # task ID actually fails the test.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    check_task_is_back()
def test_pod_health_failed_check():
    """Deploys a healthy pod, then blocks its health-check port so the checks
    fail, and verifies every task comes back under a new task ID.
    """
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    target_host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, target_host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    old_tasks = common.get_pod_tasks(pod_id)
    initial_id1 = old_tasks[0]['id']
    initial_id2 = old_tasks[1]['id']

    first_container = client.list_pod()[0]['instances'][0]['containers'][0]
    port = first_container['endpoints'][0]['allocatedHostPort']

    # Cut traffic to the allocated host port long enough for checks to fail.
    common.save_iptables(target_host)
    common.block_port(target_host, port)
    time.sleep(7)
    common.restore_iptables(target_host)

    common.deployment_wait(service_id=pod_id)

    # Both original task IDs must be gone after the restart.
    for task in common.get_pod_tasks(pod_id):
        assert task['id'] != initial_id1, "One of the tasks has not been restarted"
        assert task['id'] != initial_id2, "One of the tasks has not been restarted"
def test_task_gets_restarted_due_to_network_split():
    """Checks that a network partition makes the health check fail and the
    task get replaced with a new task ID.
    """
    app_def = apps.http_server()
    app_id = app_def["id"]
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app_info = client.get_app(app_id)
    assert app_info['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_info['tasksRunning'])
    assert app_info['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app_info['tasksHealthy'])

    task = client.get_tasks(app_id)[0]
    task_id = task['id']
    host = task['host']
    port = task['ports'][0]

    # introduce a network partition
    common.block_iptable_rules_for_seconds(host, port, sleep_seconds=10, block_input=True, block_output=False)

    common.deployment_wait(service_id=app_id)

    app_info = client.get_app(app_id)
    new_task_id = client.get_tasks(app_id)[0]['id']
    assert task_id != new_task_id, "The task didn't get killed because of a failed health check"

    assert app_info['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_info['tasksRunning'])
    assert app_info['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 0 was expected".format(app_info['tasksHealthy'])

    # network partition should cause a task restart
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        current_id = client.get_tasks(app_id)[0]['id']
        assert task_id != current_id, "The task has not been restarted: {}".format(task_id)

        state = client.get_app(app_id)
        assert state['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(state['tasksRunning'])
        assert state['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(state['tasksHealthy'])

    check_health_message()
def test_health_failed_check():
    """ Deploys a pod with good health checks, then partitions the network
        and verifies the tasks return with new task ids.
    """
    client = marathon.create_client()

    # BUG FIX: the format string had no placeholder, so `.format(uuid...)` was
    # a no-op and every run reused the constant id "/pod-ken". Use a unique
    # suffix, matching the other pod tests.
    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id
    host = ip_other_than_mom()
    pin_pod_to_host(pod_json, host)

    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    initial_id1 = tasks[0]['id']
    initial_id2 = tasks[1]['id']

    pod = client.list_pod()[0]
    container1 = pod['instances'][0]['containers'][0]
    port = container1['endpoints'][0]['allocatedHostPort']

    # Block the health-check port long enough for the checks to fail.
    save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)

    shakedown.deployment_wait()

    # Both tasks must have been replaced with new task IDs.
    tasks = get_pod_tasks(pod_id)
    for task in tasks:
        assert task['id'] != initial_id1
        assert task['id'] != initial_id2
def test_mom_when_mom_process_killed():
    """Kills the MoM scheduler process and verifies a task launched from MoM
    survives with its original task ID.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill the MoM process; Marathon should relaunch it.
        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pod_health_failed_check():
    """Verifies that blocking a healthy pod's check port partitions the
    network and forces its tasks to restart with new task IDs.
    """
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    agent = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, agent)

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    old_tasks = common.get_pod_tasks(pod_id)
    initial_id1 = old_tasks[0]['id']
    initial_id2 = old_tasks[1]['id']

    first_container = client.list_pod()[0]['instances'][0]['containers'][0]
    port = first_container['endpoints'][0]['allocatedHostPort']

    # Drop traffic on the allocated host port for 7s so health checks fail.
    common.block_iptable_rules_for_seconds(agent, port, 7, block_input=True, block_output=False)
    common.deployment_wait(service_id=pod_id)

    for new_task in common.get_pod_tasks(pod_id):
        new_task_id = new_task['id']
        assert new_task_id != initial_id1, f"Task {new_task_id} has not been restarted"  # NOQA E999
        assert new_task_id != initial_id2, f"Task {new_task_id} has not been restarted"
def test_pinned_task_does_not_scale_to_unpinned_host():
    """A pinned task that barely fits its node must not scale onto another
    node: scaling past the node's resources leaves the deployment pending.
    """
    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    print('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    client.scale_app(app_id, 2)
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
def test_marathon_when_disconnected_from_zk():
    """Starts an app, blocks Marathon's ZooKeeper access for a while, and
    verifies the running task is left untouched.
    """
    app_def = apps.sleep_app()
    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    original_task_id = client.get_tasks(app_def["id"])[0]['id']

    # Drop inbound ZK traffic (port 2181) on the agent for 10 seconds.
    common.block_iptable_rules_for_seconds(agent, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_marathon_with_master_process_failure(marathon_service_name):
    """Restarts the Mesos master; the service endpoint must come back and the
    running task must keep its original ID.
    """
    app_def = apps.sleep_app()
    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    original_task_id = client.get_tasks(app_def["id"])[0]['id']

    common.systemctl_master('restart')
    common.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_def["id"])
        assert len(tasks) == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
def test_marathon_with_master_process_failure(marathon_service_name):
    """ Launches an app from Marathon and restarts the master.
        It is expected that the service endpoint will come back and that the
        task_id is the original task_id
    """
    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']

    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000, retry_on_exception=retry_on_exception)
    def check_task_recovery():
        tasks = client.get_tasks('/master-failure')
        # BUG FIX: the bare comparison was a no-op, so the retry loop never
        # verified recovery. Assert so a replaced task ID fails the test.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    check_task_recovery()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Scaling a pinned app past the pinned node's capacity must not launch
    tasks on any other node; the deployment stays pending.
    """
    app_def = apps.sleep_app()
    app_def['cpus'] = 3.5
    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    client.scale_app(app_def["id"], 2)
    time.sleep(5)

    deployments = client.get_deployments()
    tasks = client.get_tasks(app_def["id"])

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
def test_mom_when_mom_process_killed():
    """Kills the MoM process and checks the task it launched survives."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill MoM; Marathon should restart it without touching the task.
        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_marathon_when_task_agent_bounced():
    """Restarts the agent a task runs on and expects the task to survive."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    restart_agent(agent)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_pinned_task_recovers_on_host():
    """A killed pinned task must come back on the node it was pinned to."""
    app_def = apps.sleep_app()
    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])

    common.kill_process_on_host(pinned_host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == pinned_host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], pinned_host)

    check_for_new_task()
def test_mom_when_mom_agent_bounced():
    """Restarts the agent hosting MoM and verifies a MoM-launched task
    survives with its original task ID.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    mom_ip = common.ip_of_mom()
    other_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, other_host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)

        original_task_id = client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pinned_task_recovers_on_host():
    """A killed pinned task must recover on the same pinned node."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)

    common.kill_process_on_host(pinned_host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == pinned_host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], pinned_host)

    check_for_new_task()
def test_pinned_task_scales_on_host_only():
    """Scaling a pinned app must place every new task on the pinned node."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))
    assert tasks[0]['host'] == pinned_host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], pinned_host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 10, "The number of tasks is {} after scale, but 10 was expected".format(len(tasks))
    for task in tasks:
        assert task['host'] == pinned_host, "The task is on {}, but it is supposed to be on {}".format(task['host'], pinned_host)
def test_pinned_task_scales_on_host_only():
    """Every task of a scaled, pinned app must land on the pinned node."""
    app_def = apps.sleep_app()
    target = common.ip_other_than_mom()
    common.pin_to_host(app_def, target)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))
    assert tasks[0]['host'] == target, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], target)

    client.scale_app(app_def["id"], 10)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    assert len(tasks) == 10, "The number of tasks is {} after scale, but 10 was expected".format(len(tasks))
    for task in tasks:
        assert task['host'] == target, "The task is on {}, but it is supposed to be on {}".format(task['host'], target)
def test_pinned_task_does_not_scale_to_unpinned_host():
    """A pinned task that barely fits its node must not scale onto any other
    node; after scaling, the deployment stays pending with a single task.
    """
    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    client.scale_app(app_id, 2)
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, \
        "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, \
        "The number of tasks is {}, but 1 was expected".format(len(tasks))
def test_marathon_with_master_process_failure(marathon_service_name):
    """Restarts the master; the endpoint must return and the task ID persist."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
def test_marathon_when_disconnected_from_zk():
    """Blocks Marathon's ZK access for a while and verifies the running task
    is preserved with its original ID.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    # Drop inbound ZK traffic (port 2181) on the agent for 10 seconds.
    common.block_iptable_rules_for_seconds(agent, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_task_gets_restarted_due_to_network_split():
    """A network partition must fail the health check and restart the task."""
    app_def = apps.http_server()
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app_info = client.get_app(app_def["id"])
    assert app_info['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_info['tasksRunning'])
    assert app_info['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app_info['tasksHealthy'])

    task = client.get_tasks(app_def["id"])[0]
    task_id = task['id']
    host = task['host']
    port = task['ports'][0]

    # introduce a network partition
    with shakedown.iptable_rules(host):
        common.block_port(host, port)
        time.sleep(10)

    shakedown.deployment_wait()

    app_info = client.get_app(app_def["id"])
    new_task_id = client.get_tasks(app_def["id"])[0]['id']
    assert task_id != new_task_id, "The task didn't get killed because of a failed health check"

    assert app_info['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_info['tasksRunning'])
    assert app_info['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 0 was expected".format(app_info['tasksHealthy'])

    # network partition should cause a task restart
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        current_id = client.get_tasks(app_def["id"])[0]['id']
        assert task_id != current_id, "The task has not been restarted: {}".format(task_id)

        state = client.get_app(app_def["id"])
        assert state['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(state['tasksRunning'])
        assert state['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(state['tasksHealthy'])

    check_health_message()
def test_health_failed_check():
    """ Tests a health check of an app launched by marathon.
        The health check succeeded, then failed due to a network partition.
    """
    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check())
    app_def['id'] = 'healthy'
    app_def['healthChecks'] = health_list

    pin_to_host(app_def, ip_other_than_mom())

    client.add_app(app_def)
    shakedown.deployment_wait()

    # healthy
    app = client.get_app('/healthy')
    assert app['tasksRunning'] == 1
    assert app['tasksHealthy'] == 1

    tasks = client.get_tasks('/healthy')
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # prefer to break at the agent (having issues)
    # CLEANUP: removed an unused `mom_ip = ip_of_mom()` local, a leftover from
    # an earlier approach that partitioned at the MoM instead of the agent.
    shakedown.save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)

    shakedown.deployment_wait()

    # after network failure is restored. The task returns and is a new task ID
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000, retry_on_exception=ignore_on_exception)
    def check_health_message():
        new_tasks = client.get_tasks('/healthy')
        assert new_tasks[0]['id'] != tasks[0]['id']
        app = client.get_app('/healthy')
        assert app['tasksRunning'] == 1
        assert app['tasksHealthy'] == 1

    check_health_message()
def test_marathon_when_task_agent_bounced():
    """ Launch an app and restart the node the task is on.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/agent-failure')
        # BUG FIX: the bare comparison result was discarded, so nothing was
        # verified. Assert so a changed task ID actually fails the test.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    # BUG FIX: the check was defined but never invoked — the test previously
    # always passed without verifying the task survived the agent restart.
    check_task_is_back()
def test_pin_pod():
    """Verifies that a pod can be constrained to run on a chosen host."""
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    chosen_host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, chosen_host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks))

    pod = client.list_pod()[0]
    assert pod['instances'][0]['agentHostname'] == chosen_host, "The pod didn't get pinned to {}".format(chosen_host)
def test_pinned_task_does_not_find_unknown_host():
    """ Tests that a task pinned to an unknown host will not launch.
        within 10 secs it is still in deployment and 0 tasks are running.
    """
    app_def = app('pinned')
    # CLEANUP: removed an unused `host = ip_other_than_mom()` local — the app
    # is deliberately pinned to an address outside the cluster.
    pin_to_host(app_def, '10.255.255.254')
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)

        # deploys are within secs
        # assuming after 10 no tasks meets criteria
        time.sleep(10)

        tasks = client.get_tasks('/pinned')
        assert len(tasks) == 0, "The number of tasks is {}, but 0 was expected".format(len(tasks))
def test_pinned_task_does_not_find_unknown_host():
    """ Tests that a task pinned to an unknown host will not launch.
        within 10 secs it is still in deployment and 0 tasks are running.
    """
    app_def = app('pinned')
    # CLEANUP: removed an unused `host = ip_other_than_mom()` local — the app
    # is deliberately pinned to an address outside the cluster.
    pin_to_host(app_def, '10.255.255.254')
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    client = marathon.create_client()
    client.add_app(app_def)

    # deploys are within secs
    # assuming after 10 no tasks meets criteria
    time.sleep(10)

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 0, "The number of tasks is {}, but 0 was expected".format(len(tasks))
def test_mom_when_mom_process_killed():
    """ Launched a task from MoM then killed MoM.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        # BUG FIX: the bare comparison was a no-op; assert so the test fails
        # when the task did not survive the MoM restart.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"
def test_health_failed_check():
    """ Tests a health check of an app launched by marathon.
        The health check succeeded, then failed due to a network partition.
    """
    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check())
    app_def['id'] = 'healthy'
    app_def['healthChecks'] = health_list

    pin_to_host(app_def, ip_other_than_mom())

    client.add_app(app_def)
    shakedown.deployment_wait()

    # healthy
    app = client.get_app('/healthy')
    assert app['tasksRunning'] == 1
    assert app['tasksHealthy'] == 1

    tasks = client.get_tasks('/healthy')
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # prefer to break at the agent (having issues)
    # CLEANUP: removed an unused `mom_ip = ip_of_mom()` local, a leftover from
    # an earlier approach that partitioned at the MoM instead of the agent.
    shakedown.save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)

    shakedown.deployment_wait()

    # after network failure is restored. The task returns and is a new task ID
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_health_message():
        new_tasks = client.get_tasks('/healthy')
        assert new_tasks[0]['id'] != tasks[0]['id']
        app = client.get_app('/healthy')
        assert app['tasksRunning'] == 1
        assert app['tasksHealthy'] == 1

    check_health_message()
def test_task_gets_restarted_due_to_network_split():
    """A network partition must fail the health check and restart the task."""
    app_def = apps.http_server("app-network-split")
    app_id = app_def["id"]
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app_info = client.get_app(app_id)
    assert app_info['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_info['tasksRunning'])
    assert app_info['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app_info['tasksHealthy'])

    task = client.get_tasks(app_id)[0]
    task_id = task['id']
    host = task['host']
    port = task['ports'][0]

    # introduce a network partition
    common.block_iptable_rules_for_seconds(host, port, sleep_seconds=10, block_input=True, block_output=False)

    # Network partition should cause the task to restart N times until the partition is resolved (since we
    # pinned the task to the split agent). A new task with a new taskId should eventually be running and healthy.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        current_id = client.get_tasks(app_id)[0]['id']
        assert task_id != current_id, "The task has not been restarted: {}".format(task_id)

        state = client.get_app(app_id)
        assert state['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(state['tasksRunning'])
        assert state['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(state['tasksHealthy'])

    check_health_message()
def test_pin_pod():
    """ Tests that we can pin a pod to a host.
    """
    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)
    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id

    target = ip_other_than_mom()
    pin_pod_to_host(pod_json, target)

    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2

    pod = client.list_pod()[0]
    assert pod['instances'][0]['agentHostname'] == target
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
    """
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        # BUG FIX: the check was defined but never invoked, so the test
        # previously passed without verifying the task recovered on the host.
        check_for_new_task()
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
    """
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_for_new_task():
        new_tasks = client.get_tasks('/pinned')
        assert tasks[0]['id'] != new_tasks[0]['id']
        assert new_tasks[0]['host'] == host

    # BUG FIX: the check was defined but never invoked, so the test
    # previously passed without verifying the task recovered on the host.
    check_for_new_task()
def test_pinned_task_scales_on_host_only():
    """ Tests that scaling a pinned app scales only on the pinned node.
    """
    app_def = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 1
    assert tasks[0]['host'] == pinned_host

    client.scale_app('pinned', 10)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 10
    for task in tasks:
        assert task['host'] == pinned_host
def test_marathon_with_master_process_failure(marathon_service_name):
    """ Launches an app from Marathon and restarts the master.
        It is expected that the service endpoint will come back and that the
        task_id is the original task_id
    """
    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']

    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_task_recovery():
        tasks = client.get_tasks('/master-failure')
        # BUG FIX: assert instead of discarding the comparison result.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    # BUG FIX: the check was defined but never invoked, so the test
    # previously passed without verifying the task survived.
    check_task_recovery()
def test_marathon_when_task_agent_bounced():
    """Restarts the agent hosting a task and expects the task to survive."""
    app_def = apps.sleep_app()
    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    original_task_id = client.get_tasks(app_def["id"])[0]['id']

    shakedown.restart_agent(agent)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """ Tests when a task lands on a pinned node (and barely fits) when asked
        to scale past the resources of that node will not scale.
    """
    app_def = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(app_def, pinned_host)
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        client.scale_app('pinned', 2)
        # typical deployments are sub 3 secs
        time.sleep(5)

        deployments = client.get_deployments()
        tasks = client.get_tasks('/pinned')

        # still deploying
        assert len(deployments) == 1
        assert len(tasks) == 1
def test_mom_when_mom_agent_bounced():
    """Restarts the agent running MoM and verifies a MoM-launched task
    survives with its original task ID.
    """
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    mom_ip = common.ip_of_mom()
    agent = common.ip_other_than_mom()
    common.pin_to_host(app_def, agent)

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)

        original_task_id = client.get_tasks(app_id)[0]['id']

        restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """ Tests when a task lands on a pinned node (and barely fits) when asked
        to scale past the resources of that node will not scale.
    """
    app_def = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(app_def, pinned_host)
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    client.scale_app('pinned', 2)
    # typical deployments are sub 3 secs
    time.sleep(5)

    deployments = client.get_deployments()
    tasks = client.get_tasks('/pinned')

    # still deploying
    assert len(deployments) == 1
    assert len(tasks) == 1
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Scaling a pinned app past its node's capacity must not place tasks on
    any other node; the deployment stays pending with one task.
    """
    app_def = apps.sleep_app()
    app_def['cpus'] = 3.5
    pinned_host = common.ip_other_than_mom()
    common.pin_to_host(app_def, pinned_host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    client.scale_app(app_def["id"], 2)
    time.sleep(5)

    deployments = client.get_deployments()
    tasks = client.get_tasks(app_def["id"])

    # still deploying
    assert len(deployments) == 1, \
        "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, \
        "The number of tasks is {}, but 1 was expected".format(len(tasks))