def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
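# The tests in this module assume an apps.sleep_app() fixture that returns a
# minimal Marathon app definition. The sketch below only illustrates the shape
# of such a fixture; the id, command, and resource values are assumptions, not
# the project's actual fixture.
def sleep_app_sketch():
    """Hypothetical stand-in for apps.sleep_app(): a long-running no-op app."""
    return {
        "id": "/sleep-app",      # the real fixture presumably generates a unique id
        "cmd": "sleep 1000",     # keeps the task alive for the duration of the test
        "cpus": 0.1,
        "mem": 32,
        "instances": 1
    }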
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, then kill the MoM process."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, then kill the MoM process."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures
    simulated by knocking out ports."""

    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    common.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds(), path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures
    simulated by knocking out ports."""

    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    shakedown.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures
    simulated by knocking out ports."""

    # get MoM ip
    mom_ip = ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = get_resource("{}/large-sleep.json".format(fixture_dir()))

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", "sleep")
        tasks = client.get_tasks('sleep')
        original_sleep_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    service_delay()

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    service_delay()
    shakedown.wait_for_service_endpoint(PACKAGE_APP_ID)
    shakedown.wait_for_task("marathon-user", "sleep")

    with marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", "sleep")
        tasks = client.get_tasks('sleep')
        current_sleep_task_id = tasks[0]["id"]
        assert current_sleep_task_id == original_sleep_task_id, "Task ID shouldn't change"
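# partition_agent() and reconnect_agent() are helpers that the network-failure
# tests above assume but do not define here. The sketch below is only an
# illustration built from the iptables calls already used elsewhere in this
# module (save_iptables / block_port / restore_iptables); a real partition
# helper would cut far more than a single port, and the port value is an
# assumption.
def partition_agent_sketch(hostname, port=5051):
    """Crude stand-in for partition_agent(): block one port on the node."""
    shakedown.save_iptables(hostname)   # remember the current rules so they can be restored
    block_port(hostname, port)          # e.g. the Mesos agent port (illustrative value)

def reconnect_agent_sketch(hostname):
    """Stand-in for reconnect_agent(): reapply the saved iptables rules."""
    restore_iptables(hostname)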
def test_health_failed_check():
    """Tests a health check of an app launched by marathon.

    The health check succeeds at first, then fails due to a network partition.
    """

    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check())
    app_def['id'] = 'healthy'
    app_def['healthChecks'] = health_list

    pin_to_host(app_def, ip_other_than_mom())

    client.add_app(app_def)
    shakedown.deployment_wait()

    # healthy
    app = client.get_app('/healthy')
    assert app['tasksRunning'] == 1
    assert app['tasksHealthy'] == 1

    tasks = client.get_tasks('/healthy')
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # prefer to break at the agent (having issues)
    mom_ip = ip_of_mom()

    shakedown.save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)
    shakedown.deployment_wait()

    # After the network failure is restored, the task comes back with a new task ID.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000, retry_on_exception=ignore_on_exception)
    def check_health_message():
        new_tasks = client.get_tasks('/healthy')
        assert new_tasks[0]['id'] != tasks[0]['id']
        app = client.get_app('/healthy')
        assert app['tasksRunning'] == 1
        assert app['tasksHealthy'] == 1

    check_health_message()
def test_health_failed_check():
    """Tests a health check of an app launched by marathon.

    The health check succeeds at first, then fails due to a network partition.
    """

    client = marathon.create_client()
    app_def = python_http_app()
    health_list = []
    health_list.append(health_check())
    app_def['id'] = 'healthy'
    app_def['healthChecks'] = health_list

    pin_to_host(app_def, ip_other_than_mom())

    client.add_app(app_def)
    shakedown.deployment_wait()

    # healthy
    app = client.get_app('/healthy')
    assert app['tasksRunning'] == 1
    assert app['tasksHealthy'] == 1

    tasks = client.get_tasks('/healthy')
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # prefer to break at the agent (having issues)
    mom_ip = ip_of_mom()

    shakedown.save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)
    shakedown.deployment_wait()

    # After the network failure is restored, the task comes back with a new task ID.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_health_message():
        new_tasks = client.get_tasks('/healthy')
        assert new_tasks[0]['id'] != tasks[0]['id']
        app = client.get_app('/healthy')
        assert app['tasksRunning'] == 1
        assert app['tasksHealthy'] == 1

    check_health_message()
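# health_check() above is assumed to return a single Marathon health-check
# definition. A sketch of such a helper follows; the keys match Marathon's
# healthChecks schema, but the specific values are illustrative assumptions,
# not the project's actual helper.
def health_check_sketch(path='/', interval_seconds=2, failures=1):
    """Hypothetical stand-in for health_check(): an HTTP check on the first port."""
    return {
        'protocol': 'HTTP',
        'path': path,
        'portIndex': 0,                       # check the app's first allocated port
        'gracePeriodSeconds': 10,
        'intervalSeconds': interval_seconds,
        'timeoutSeconds': 2,
        'maxConsecutiveFailures': failures    # fail fast so the blocked port is noticed
    }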
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, then kill the MoM process."""

    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = app('agent-failure')
    mom_ip = ip_of_mom()
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_task_is_back():
            tasks = client.get_tasks('/agent-failure')
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures
    simulated by knocking out ports."""

    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
    wait_for_task("marathon-user", app_id.lstrip('/'))

    with marathon_on_marathon() as client:
        wait_for_task("marathon-user", app_id.lstrip('/'))

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
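# Several tests above pass retry_on_exception=common.ignore_exception to
# @retrying.retry. The retrying library calls that predicate with the raised
# exception and keeps retrying while it returns True (up to the attempt limit).
# A minimal sketch of such a predicate, assuming it simply retries on any
# exception:
def ignore_exception_sketch(exc):
    """Hypothetical stand-in for common.ignore_exception."""
    return isinstance(exc, Exception)   # True -> retrying retries the wrapped call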