def test_mom_with_network_failure_bounce_master():
    """Check that a MoM task keeps its ID across a partition and master bounce.

    Marathon on Marathon (MoM) test for DC/OS with network failures simulated
    by knocking out ports. The agent hosting MoM and the agent hosting the
    app's task are partitioned, the Mesos master service is restarted while
    they are cut off, and the agents are reconnected afterwards. The test
    passes when the app's first task still reports its pre-outage ID.
    """
    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    partitioned_hosts = []
    try:
        # take out the net
        for host in (mom_ip, task_ip):
            partition_agent(host)
            partitioned_hosts.append(host)

        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up -- done in ``finally`` so that a failure while the
        # agents are cut off does not leave them partitioned for later tests.
        # Only hosts whose partition call returned are reconnected, in the
        # same order they were partitioned.
        for host in partitioned_hosts:
            reconnect_agent(host)

    time.sleep(timedelta(minutes=1).total_seconds())
    wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds(), path="ping")

    with marathon_on_marathon() as client:
        wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        # Retried up to 30 times, one second apart; which exceptions trigger
        # a retry is decided by common.ignore_exception.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on.

    The app is pinned to a host other than the one running MoM, so the agent
    restart targets MoM's node only. The test passes when the app's first
    task still reports the ID it had before the restart.
    """
    sleep_app = apps.sleep_app()
    app_id = sleep_app["id"]
    mom_ip = common.ip_of_mom()
    common.pin_to_host(sleep_app, common.ip_other_than_mom())

    # Up to 30 attempts, one second apart; common.ignore_exception decides
    # which exceptions trigger another attempt.
    retry_policy = retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_exception,
    )

    with marathon_on_marathon() as client:
        client.add_app(sleep_app)
        deployment_wait(service_id=app_id, client=client)
        original_task_id = client.get_tasks(app_id)[0]['id']

        restart_agent(mom_ip)

        @retry_policy
        def check_task_is_back():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and check the task's ID.

    The app is pinned to a host other than the one running MoM. After the
    'marathon-assembly' process on MoM's host is killed, the test waits for
    the 'marathon-user' task and its ping endpoint, then asserts that the
    app's first task still has its original ID.
    """
    sleep_app = apps.sleep_app()
    app_id = sleep_app["id"]
    common.pin_to_host(sleep_app, common.ip_other_than_mom())

    # Up to 30 attempts, one second apart; common.ignore_exception decides
    # which exceptions trigger another attempt.
    retry_policy = retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_exception,
    )

    with marathon_on_marathon() as client:
        client.add_app(sleep_app)
        deployment_wait(service_id=app_id, client=client)
        original_task_id = client.get_tasks(app_id)[0]['id']

        mom_host = common.ip_of_mom()
        common.kill_process_on_host(mom_host, 'marathon-assembly')
        wait_for_task('marathon', 'marathon-user', 300)
        wait_for_service_endpoint('marathon-user', path="ping")

        @retry_policy
        def check_task_is_back():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Verify a MoM-launched task survives a network outage and master restart.

    Marathon on Marathon (MoM) test for DC/OS with network failures simulated
    by knocking out ports. Steps: deploy a sleep app through MoM, partition
    the MoM agent and the task's agent, restart the Mesos master service,
    reconnect the agents, then assert that the app's first task still has the
    ID recorded before the outage.
    """
    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    cut_off = []
    try:
        # take out the net
        for agent_ip in (mom_ip, task_ip):
            partition_agent(agent_ip)
            cut_off.append(agent_ip)

        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up -- always attempted, so an error raised while the
        # agents are isolated cannot leave the cluster partitioned. Only
        # agents whose partition call completed are reconnected, in the
        # original order.
        for agent_ip in cut_off:
            reconnect_agent(agent_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds(), path="ping")

    with marathon_on_marathon() as client:
        wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        # Retried up to 30 times, one second apart; which exceptions trigger
        # a retry is decided by common.ignore_exception.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def wait_for_marathon_user_and_cleanup():
    """Wait for 'marathon-user', yield, then wait again and clean up.

    This is a generator: everything before ``yield`` runs as setup and
    everything after it as teardown, all inside one ``marathon_on_marathon``
    context. Each wait polls the 'marathon-user' ping path with a five-minute
    timeout.

    NOTE(review): presumably decorated as a pytest fixture; the decorator is
    not visible in this chunk -- confirm.
    """
    five_minutes = timedelta(minutes=5).total_seconds()

    wait_for_service_endpoint('marathon-user', five_minutes, path="ping")
    with marathon_on_marathon() as mom_client:
        yield
        # Teardown: make sure MoM is reachable again before cleaning it up.
        wait_for_service_endpoint('marathon-user', five_minutes, path="ping")
        common.clean_up_marathon(mom_client)
def simple_sleep_app(mom_endpoint):
    """Deploy a simple sleep app in the MoM-EE and report whether a task shows up.

    Args:
        mom_endpoint: service name of the MoM-EE instance to deploy into.

    Returns:
        True when ``get_service_task`` returns something other than None for
        the app, False otherwise.
    """
    with marathon_on_marathon(name=mom_endpoint) as mom_client:
        sleep_app = apps.sleep_app()
        sleep_app_id = sleep_app["id"]

        mom_client.add_app(sleep_app)
        common.deployment_wait(service_id=sleep_app_id, client=mom_client)

        # The task is looked up by the app id without its leading slash(es).
        task_name = sleep_app_id.lstrip("/")
        found = get_service_task(mom_endpoint, task_name)
        logger.info('MoM-EE tasks: {}'.format(found))
        return found is not None
def simple_sleep_app(mom_endpoint):
    """Deploy a simple sleep app in the MoM-EE and check for its task.

    Args:
        mom_endpoint: service name of the MoM-EE instance to deploy into.

    Returns:
        bool: whether ``get_service_task`` returned a non-None value for the
        deployed app.
    """
    with marathon_on_marathon(name=mom_endpoint) as mom_client:
        definition = apps.sleep_app()
        definition_id = definition["id"]

        mom_client.add_app(definition)
        deployment_wait(service_id=definition_id, client=mom_client)

        # Lookup uses the app id stripped of leading slashes.
        service_tasks = get_service_task(mom_endpoint, definition_id.lstrip("/"))
        logger.info('MoM-EE tasks: {}'.format(service_tasks))
        has_task = service_tasks is not None
        return has_task
def teardown_module(module):
    """Module-level teardown: clean MoM, uninstall it, then clean root Marathon.

    Cleaning up inside MoM is best effort. A failure there is logged and
    teardown carries on, so the package uninstall, the ZooKeeper node removal
    and the root Marathon cleanup still run.

    Args:
        module: the test module object; not used by this function.
    """
    with marathon_on_marathon() as client:
        try:
            common.clean_up_marathon(client=client)
        except Exception:
            # Deliberately non-fatal, but record the failure instead of
            # swallowing it so a broken cleanup can be diagnosed.
            logger.warning("Cleaning up Marathon on Marathon failed; continuing teardown", exc_info=True)

    uninstall_package_and_wait('marathon')
    delete_zk_node('universe/marathon-user')

    # Remove everything from root marathon
    common.clean_up_marathon()
def remove_mom_ee():
    """Delete apps from every reachable MoM-EE endpoint, then remove MoM-EE.

    For each known (version, security mode) pair the matching endpoint is
    checked. If the service is available, all apps inside that MoM-EE are
    deleted. Afterwards the ``MOM_EE_NAME`` app is removed through the default
    Marathon client and the deployment is awaited.
    """
    versions = ('1.6', '1.5', '1.4')
    security_modes = ('strict', 'permissive')

    # Same order as the explicit list: each version, strict before permissive.
    for version in versions:
        for security_mode in security_modes:
            endpoint = mom_ee_endpoint(version, security_mode)
            logger.info('Checking endpoint: {}'.format(endpoint))
            if not service_available_predicate(endpoint):
                continue
            logger.info('Removing {}...'.format(endpoint))
            with marathon_on_marathon(name=endpoint) as mom_client:
                delete_all_apps(client=mom_client)

    root_client = marathon.create_client()
    root_client.remove_app(MOM_EE_NAME)
    common.deployment_wait(MOM_EE_NAME)
    logger.info('Successfully removed {}'.format(MOM_EE_NAME))
def test_framework_unavailable_on_mom():
    """Launches an app that has elements necessary to create a service endpoint in DCOS.

    This test confirms that the endpoint is not created when launched with MoM.
    The 'pyfw' endpoint is awaited with a timeout of 15; any exception from
    that wait is the expected outcome, and a successful wait fails the test.
    """
    app_def = apps.fake_framework()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)

    try:
        wait_for_service_endpoint('pyfw', 15)
    except Exception:
        # Expected: the endpoint never became available.
        pass
    else:
        # Raise explicitly rather than ``assert False`` so the check still
        # fires when Python runs with assertions disabled (-O).
        raise AssertionError('MoM should NOT create a service endpoint')
def cluster_info(mom_name='marathon-user'):
    """Log basic facts about the cluster and its Marathon instances.

    Logs the DC/OS version and mode, the number of private agents, the root
    Marathon version and, when the ``mom_name`` service is available, the
    Marathon on Marathon version.

    Args:
        mom_name: service name of the MoM instance to query.
    """
    logger.info("DC/OS: %s, in %s mode", dcos_version(), ee_version())
    private_agents = get_private_agents()
    logger.info("Agents: %d", len(private_agents))

    root_client = marathon.create_client()
    root_about = root_client.get_about()
    logger.info("Marathon version: %s", root_about.get("version"))

    if not service_available_predicate(mom_name):
        logger.info("Marathon MoM not present")
        return

    with marathon_on_marathon(mom_name) as mom_client:
        try:
            mom_about = mom_client.get_about()
            logger.info("Marathon MoM version: {}".format(mom_about.get("version")))
        except Exception:
            # The service is registered, but querying it failed.
            logger.info("Marathon MoM not present")
def cluster_info(mom_name='marathon-user'):
    """Write a short summary of the cluster to the log.

    Covers the DC/OS version and mode, the private agent count, the root
    Marathon version, and the MoM version if the ``mom_name`` service is
    available and answers the query.

    Args:
        mom_name: service name of the Marathon on Marathon instance.
    """
    logger.info("DC/OS: %s, in %s mode", dcos_version(), ee_version())
    agent_count = len(get_private_agents())
    logger.info("Agents: %d", agent_count)

    marathon_about = marathon.create_client().get_about()
    logger.info("Marathon version: %s", marathon_about.get("version"))

    mom_available = service_available_predicate(mom_name)
    if mom_available:
        with marathon_on_marathon(mom_name) as mom_client:
            try:
                mom_version = mom_client.get_about().get("version")
                logger.info("Marathon MoM version: {}".format(mom_version))
            except Exception:
                # The service is registered, but querying it failed.
                logger.info("Marathon MoM not present")
    else:
        logger.info("Marathon MoM not present")
def remove_mom_ee():
    """Empty every available MoM-EE endpoint and then remove the MoM-EE app.

    Each known (version, security mode) combination is mapped to an endpoint.
    Endpoints whose service is available have all their apps deleted. Finally
    ``MOM_EE_NAME`` is removed via the default Marathon client and its
    deployment is awaited.
    """
    candidates = (
        ('1.7', 'strict'),
        ('1.7', 'permissive'),
        ('1.6', 'strict'),
        ('1.6', 'permissive'),
        ('1.5', 'strict'),
        ('1.5', 'permissive'),
    )

    for version, security_mode in candidates:
        endpoint = mom_ee_endpoint(version, security_mode)
        logger.info('Checking endpoint: {}'.format(endpoint))
        if not service_available_predicate(endpoint):
            continue
        logger.info('Removing {}...'.format(endpoint))
        with marathon_on_marathon(name=endpoint) as mom_client:
            delete_all_apps(client=mom_client)

    root_client = marathon.create_client()
    root_client.remove_app(MOM_EE_NAME)
    deployment_wait(MOM_EE_NAME)
    logger.info('Successfully removed {}'.format(MOM_EE_NAME))
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on.

    The app is pinned away from MoM's host. After the agent on MoM's host is
    restarted, the app's first task must still carry its original ID.
    """
    definition = apps.sleep_app()
    app_id = definition["id"]
    mom_host = common.ip_of_mom()
    other_host = common.ip_other_than_mom()
    common.pin_to_host(definition, other_host)

    with marathon_on_marathon() as client:
        client.add_app(definition)
        deployment_wait(service_id=app_id, client=client)
        task_id_before = client.get_tasks(app_id)[0]['id']

        restart_agent(mom_host)

        # Up to 30 attempts, one second apart; common.ignore_exception
        # decides which exceptions trigger another attempt.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            task_id_now = client.get_tasks(app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, then kill MoM and verify the task's ID.

    The app is pinned away from MoM's host. Once the 'marathon-assembly'
    process on MoM's host has been killed and 'marathon-user' is reachable
    again, the app's first task must still carry its original ID.
    """
    definition = apps.sleep_app()
    app_id = definition["id"]
    other_host = common.ip_other_than_mom()
    common.pin_to_host(definition, other_host)

    with marathon_on_marathon() as client:
        client.add_app(definition)
        deployment_wait(service_id=app_id, client=client)
        task_id_before = client.get_tasks(app_id)[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        wait_for_task('marathon', 'marathon-user', 300)
        wait_for_service_endpoint('marathon-user', path="ping")

        # Up to 30 attempts, one second apart; common.ignore_exception
        # decides which exceptions trigger another attempt.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            task_id_now = client.get_tasks(app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def setup_module(module):
    """Module-level setup: ensure MoM, log cluster info, clean up MoM.

    Args:
        module: the test module object; not used by this function.
    """
    common.ensure_mom()
    common.cluster_info()

    # Clean up through a client bound to Marathon on Marathon.
    with marathon_on_marathon() as mom_client:
        common.clean_up_marathon(client=mom_client)