def test_all_partition():
    """Partition every agent reported for the service, reconnect them, then check the service runs."""
    agent_hosts = shakedown.get_service_ips(config.SERVICE_NAME)

    # Cut off all agents first; only afterwards start restoring them.
    for agent in agent_hosts:
        shakedown.partition_agent(agent)
    for agent in agent_hosts:
        shakedown.reconnect_agent(agent)

    config.check_running()
def test_config_update_then_scheduler_died():
    """Bump the world CPU config, kill the scheduler process, and expect new world task ids."""
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)

    config.bump_world_cpus()
    # Kill the process matching the scheduler main class on the scheduler's host.
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before)
    config.check_running()
def test_pods_restart_graceful_shutdown():
    """Reinstall with a world ``kill_grace_period`` of 30, restart world-0, and check its log.

    After the restart the completed task's log must contain an "all clean" line
    that is not the shell command itself (i.e. does not contain "echo").
    """
    install_options = {"world": {"kill_grace_period": 30}}

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        config.DEFAULT_TASK_COUNT,
        additional_options=install_options)

    old_world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    response = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0', json=True)
    assert len(response) == 2
    assert response['pod'] == 'world-0'
    assert len(response['tasks']) == 1
    assert response['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', old_world_ids)
    config.check_running()

    # Ensure the SIGTERM was handled: the "all clean" message must come from the
    # world service's signal trap/handler, NOT from the shell command line
    # (which is recognisable by the word "echo").
    task_log = sdk_cmd.run_cli(
        "task log --completed --lines=1000 {}".format(old_world_ids[0]))
    handler_lines = [
        line for line in task_log.split('\n')
        if 'echo' not in line and 'all clean' in line
    ]
    assert handler_lines
def test_pod_restart():
    """Restart pod hello-0 via the CLI and verify the task is replaced on the same agent."""

    def current_agent_id():
        # Read the agent (slave) id from the first entry of `pod info hello-0`.
        pod_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME, 'pod info hello-0', json=True)
        return pod_info[0]['info']['slaveId']['value']

    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')
    agent_before = current_agent_id()

    response = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart hello-0', json=True)
    assert len(response) == 2
    assert response['pod'] == 'hello-0'
    assert len(response['tasks']) == 1
    assert response['tasks'][0] == 'hello-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', ids_before)
    config.check_running()

    # A restart (unlike a replace) must keep the pod on its original agent.
    agent_after = current_agent_id()
    assert agent_before == agent_after
def test_deploy():
    """Confirm hello-0-server fails at first, then fix the Marathon env and expect a running service."""
    timeout_s = 30

    # taskcfg.yml will initially fail to deploy because several options are missing in
    # the default sdk_marathon.json.mustache. Verify that the task is failing before
    # continuing.
    failing_task = 'hello-0-server'
    log.info('Checking that {} is failing to launch within {}s'.format(failing_task, timeout_s))

    baseline = sdk_tasks.get_status_history(failing_task)
    baseline_len = len(baseline)

    # Poll once per second until a TASK_FAILED shows up among the statuses recorded
    # after the baseline snapshot.
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=1000 * timeout_s,
        retry_on_result=lambda found: not found)
    def wait_for_new_failures():
        current = sdk_tasks.get_status_history(failing_task)
        assert len(current) >= baseline_len
        fresh = current[baseline_len:]
        log.info('New {} statuses: {}'.format(failing_task, ', '.join(fresh)))
        return 'TASK_FAILED' in fresh

    wait_for_new_failures()

    # Add the needed envvars in marathon and confirm that the deployment succeeds.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_env = app_definition['env']
    del app_env['SLEEP_DURATION']
    app_env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    app_env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    sdk_marathon.update_app(config.SERVICE_NAME, app_definition)

    config.check_running()
def test_pods_restart_graceful_shutdown():
    """Restart world-0 after installing with a kill grace period and look for the handler's log line.

    The service is reinstalled with ``world.kill_grace_period`` set to 30; after the
    pod restart, the completed task log must contain an "all clean" line that does
    not also contain "echo".
    """
    grace_options = {"world": {"kill_grace_period": 30}}

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        config.DEFAULT_TASK_COUNT,
        additional_options=grace_options,
    )

    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    restart_reply = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0', json=True)
    assert len(restart_reply) == 2
    assert restart_reply['pod'] == 'world-0'
    assert len(restart_reply['tasks']) == 1
    assert restart_reply['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', ids_before)
    config.check_running()

    # The "all clean" message proves the SIGTERM reached the world service's signal
    # trap/handler. Lines containing "echo" are the shell command itself and are
    # therefore excluded.
    log_text = sdk_cmd.run_cli(
        "task log --completed --lines=1000 {}".format(ids_before[0]))
    found_handler_output = any(
        'all clean' in line
        for line in log_text.split('\n')
        if 'echo' not in line
    )
    assert found_handler_output
def test_kill_hello_node():
    """Kill the 'hello' process on hello-0's host and expect the hello-0 task ids to change."""
    config.check_running()

    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')
    target_host = 'hello-0-server.hello-world.mesos'
    sdk_cmd.kill_task_with_pattern('hello', target_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', ids_before)
    config.check_running()
def test_deploy():
    """Watch tasks fail for the full wait period, then fix the Marathon env and expect success."""
    wait_time = 30

    # taskcfg.yml will initially fail to deploy because several options are missing in
    # the default marathon.json.mustache. Verify that tasks are failing for 30s before
    # continuing.
    print('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # We can get brief blips of TASK_RUNNING but they shouldn't last more than 2-3s.
    # A one-element list stands in for a mutable counter shared with the poller.
    running_streak = [0]

    def poll_task_states():
        states = [task['state'] for task in shakedown.get_service_tasks(PACKAGE_NAME)]
        print('Task states: {}'.format(states))
        if 'TASK_RUNNING' not in states:
            running_streak[0] = 0
        else:
            running_streak[0] += 1
            assert running_streak[0] <= 3
        # Never report success: the wait below is expected to time out.
        return False

    try:
        spin.time_wait_noisy(lambda: poll_task_states(), timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')

    # Add the needed envvars in marathon and confirm that the deployment succeeds.
    marathon_config = marathon.get_config(PACKAGE_NAME)
    app_env = marathon_config['env']
    del app_env['SLEEP_DURATION']
    app_env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    app_env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    marathon.update_app(PACKAGE_NAME, marathon_config)

    check_running()
def test_deploy():
    """Wait for hello-0-server to report TASK_FAILED, then repair the Marathon env and check running."""
    timeout_s = 30

    # taskcfg.yml will initially fail to deploy because several options are missing in
    # the default sdk_marathon.json.mustache. Verify that the task is failing before
    # continuing.
    failing_task = "hello-0-server"
    log.info("Checking that {} is failing to launch within {}s".format(failing_task, timeout_s))

    history_before = _get_state_history(failing_task)
    seen_count = len(history_before)

    # Retry every second (up to the timeout) until a new TASK_FAILED appears.
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=1000 * timeout_s,
        retry_on_result=lambda found: not found,
    )
    def wait_for_new_failures():
        history_now = _get_state_history(failing_task)
        assert len(history_now) >= seen_count
        newly_added = history_now[seen_count:]
        log.info("Added {} state history: {}".format(failing_task, ", ".join(newly_added)))
        return "TASK_FAILED" in newly_added

    wait_for_new_failures()

    # Add the needed envvars in marathon and confirm that the deployment succeeds.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_env = app_definition["env"]
    del app_env["SLEEP_DURATION"]
    app_env["TASKCFG_ALL_OUTPUT_FILENAME"] = "output"
    app_env["TASKCFG_ALL_SLEEP_DURATION"] = "1000"
    sdk_marathon.update_app(app_definition)

    config.check_running()
def test_config_updates_then_all_executors_killed():
    """Bump the world CPU config, kill the executor process on every service host, expect new tasks.

    The world task ids and the host list are captured before the config change so
    that the update check compares against the pre-change state.
    """
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)

    config.bump_world_cpus()

    # Plain loop: the kills are side effects, so no throwaway list is built.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_deploy():
    """After deployment kicks off, wait for hello-0-server to fail, then fix the env and check running."""
    timeout_s = 600

    sdk_plan.wait_for_kicked_off_deployment(config.SERVICE_NAME)

    # taskcfg.yml will initially fail to deploy because several options are missing in
    # the default sdk_marathon.json.mustache. Verify that the task is failing before
    # continuing.
    failing_task = 'hello-0-server'
    log.info('Checking that {} is failing to launch within {}s'.format(failing_task, timeout_s))

    history_before = _get_state_history(failing_task)
    seen_count = len(history_before)

    # Retry every second (up to the timeout) until a new TASK_FAILED appears.
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=1000 * timeout_s,
        retry_on_result=lambda found: not found,
    )
    def wait_for_new_failures():
        history_now = _get_state_history(failing_task)
        assert len(history_now) >= seen_count
        newly_added = history_now[seen_count:]
        log.info("Added {} state history: {}".format(failing_task, ", ".join(newly_added)))
        return "TASK_FAILED" in newly_added

    wait_for_new_failures()

    # Add the needed envvars in marathon and confirm that the deployment succeeds.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_env = app_definition["env"]
    del app_env["SLEEP_DURATION"]
    app_env["TASKCFG_ALL_OUTPUT_FILENAME"] = "output"
    app_env["TASKCFG_ALL_SLEEP_DURATION"] = "1000"
    sdk_marathon.update_app(app_definition)

    config.check_running()
def test_config_update_then_scheduler_died():
    """Bump world CPUs, kill the scheduler process on its host, and expect new world task ids."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')
    scheduler_host = marathon.get_scheduler_host(PACKAGE_NAME)

    bump_world_cpus()
    tasks.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
def test_mesos_v1_api():
    """Install the service with ``mesos_api_version`` V1, check it runs, then restore the default install."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    def install_with(service_options):
        # Install under the foldered name with the given "service" option section.
        sdk_install.install(
            config.PACKAGE_NAME,
            foldered_name,
            config.DEFAULT_TASK_COUNT,
            additional_options={"service": service_options})

    # Install Hello World using the v1 api, then clean up afterwards.
    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
    install_with({"name": foldered_name, "mesos_api_version": "V1"})
    config.check_running(foldered_name)
    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)

    # Reinstall the v0 version for the following tests.
    install_with({"name": foldered_name})
def test_modify_app_config():
    """Change one app config value through Marathon and verify every task group is relaunched.

    The value of ``TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS``
    is incremented by one and PUT back to Marathon. The journal, name, zkfc and data
    tasks must then all have new task ids.
    """
    check_running()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    # Snapshot the task ids of every group before touching the configuration.
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(marathon_config)

    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=marathon_config)

    # All tasks should be updated because hdfs-site.xml has changed.
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    # Compare data tasks against the data snapshot (previously journal_ids was
    # passed here by mistake, so the data tasks were never really checked).
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)

    check_running()
def test_config_updates_then_all_executors_killed():
    """Bump the world CPU config, kill the executor process on every service host, expect new tasks.

    The world task ids and the host list are captured before the config change so
    that the update check compares against the pre-change state.
    """
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)

    config.bump_world_cpus()

    # Plain loop: the kills are side effects, so no throwaway list is built.
    for host in hosts:
        sdk_tasks.kill_task_with_pattern('helloworld.executor.Main', host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill the 'world' process on world-0's host, and expect new world task ids."""
    # Kill 1 of 2 world tasks.
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')

    config.bump_world_cpus()
    target_host = 'world-0-server.{}.mesos'.format(config.SERVICE_NAME)
    sdk_tasks.kill_task_with_pattern('world', target_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill the 'world' process on world-0's host, and expect new world task ids."""
    # Kill 1 of 2 world tasks.
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')

    config.bump_world_cpus()
    target_host = 'world-0-server.{}.mesos'.format(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern('world', target_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before)
    config.check_running()
def test_config_update_then_executor_killed():
    """Bump world CPUs, kill the executor process on world-0's host, and expect new world task ids."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')

    bump_world_cpus()
    target_host = 'world-0-server.{}.mesos'.format(PACKAGE_NAME)
    tasks.kill_task_with_pattern('helloworld.executor.Main', target_host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
def test_config_update_then_kill_all_task_in_node():
    """Bump world CPUs, kill the 'world' process on every service host, and expect new world task ids."""
    # Kill both world tasks.
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')

    bump_world_cpus()
    hosts = shakedown.get_service_ips(PACKAGE_NAME)

    # Plain loop: the kills are side effects, so no throwaway list is built.
    for host in hosts:
        tasks.kill_task_with_pattern('world', host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill the 'world' process on world-0's host, and expect new world task ids."""
    # Kill 1 of 2 world tasks.
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')

    bump_world_cpus()
    target_host = 'world-0-server.{}.mesos'.format(PACKAGE_NAME)
    tasks.kill_task_with_pattern('world', target_host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
def test_kill_hello_node():
    """Kill the 'hello' process on hello-0's host and expect the hello-0 task ids to change."""
    check_running()

    # Use one prefix for both the snapshot and the update check, so the check
    # compares like with like (the snapshot previously used 'hello-0' while the
    # check used the broader 'hello' prefix).
    pod_prefix = 'hello-0'
    hello_ids = sdk_tasks.get_task_ids(PACKAGE_NAME, pod_prefix)

    sdk_tasks.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')

    sdk_tasks.check_tasks_updated(PACKAGE_NAME, pod_prefix, hello_ids)
    check_running()
def test_config_update_then_kill_all_task_in_node():
    """Bump world CPUs, kill the 'world' process on every service host, and expect new world task ids."""
    # Kill both world tasks.
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)

    config.bump_world_cpus()

    # Plain loop: the kills are side effects, so no throwaway list is built.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_executor_killed():
    """Bump world CPUs, kill the executor process on world-0's host, and expect new world task ids."""
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')

    config.bump_world_cpus()
    target_host = 'world-0-server.{}.mesos'.format(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', target_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before)
    config.check_running()
def test_uninstall():
    """Set SDK_UNINSTALL in the Marathon env and expect the deployment to finish with zero tasks."""
    config.check_running()

    # Add the needed envvar in marathon and confirm that the uninstall "deployment"
    # succeeds.
    app_definition = sdk_marathon.get_config(config.PACKAGE_NAME)
    app_definition['env']['SDK_UNINSTALL'] = 'w00t'
    sdk_marathon.update_app(config.PACKAGE_NAME, app_definition)

    sdk_plan.wait_for_completed_deployment(config.PACKAGE_NAME)
    sdk_tasks.check_running(config.PACKAGE_NAME, 0)
def test_uninstall():
    """Set SDK_UNINSTALL in the Marathon env and expect the deployment to finish with exactly zero tasks."""
    config.check_running()

    # Add the needed envvar in marathon and confirm that the uninstall "deployment"
    # succeeds.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_definition["env"]["SDK_UNINSTALL"] = "w00t"
    sdk_marathon.update_app(app_definition)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0, allow_more=False)
def test_uninstall():
    """Set SDK_UNINSTALL in the Marathon env and expect the deployment to finish with zero tasks."""
    config.check_running()

    # Add the needed envvar in marathon and confirm that the uninstall "deployment"
    # succeeds.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_definition['env']['SDK_UNINSTALL'] = 'w00t'
    sdk_marathon.update_app(config.SERVICE_NAME, app_definition)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0)
def test_bump_hello_nodes():
    """Bump HELLO_COUNT and verify the pre-existing hello tasks keep their ids."""
    check_running()

    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    sdk_utils.out('hello ids: {!s}'.format(ids_before))

    marathon.bump_task_count_config(PACKAGE_NAME, 'HELLO_COUNT')

    check_running()
    # The pre-existing hello tasks must not have been relaunched.
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', ids_before)
def test_bump_hello_nodes():
    """Bump HELLO_COUNT for the foldered service and verify existing hello tasks keep their ids."""
    service = FOLDERED_SERVICE_NAME
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, 'hello')
    log.info('hello ids: {!s}'.format(ids_before))

    sdk_marathon.bump_task_count_config(service, 'HELLO_COUNT')

    config.check_running(service)
    # The pre-existing hello tasks must not have been relaunched.
    sdk_tasks.check_tasks_not_updated(service, 'hello', ids_before)
def test_bump_hello_nodes():
    """Bump HELLO_COUNT for the foldered service and verify existing hello tasks keep their ids."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, 'hello')
    log.info('hello ids: {!s}'.format(ids_before))

    sdk_marathon.bump_task_count_config(service, 'HELLO_COUNT')

    config.check_running(service)
    # The pre-existing hello tasks must not have been relaunched.
    sdk_tasks.check_tasks_not_updated(service, 'hello', ids_before)
def test_pod_replace():
    """Run `pod replace world-0`, validate the CLI reply, and expect new world-0 task ids."""
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    reply = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0', json=True)
    assert len(reply) == 2
    assert reply['pod'] == 'world-0'
    assert len(reply['tasks']) == 1
    assert reply['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', ids_before)
    config.check_running()
def test_finish_task_restarts_on_config_update():
    """Bump world CPUs and verify the completed world-0-finish task is relaunched."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(service)

    finish_task = "world-0-finish"
    completed_id = get_completed_task_id(finish_task)
    assert completed_id is not None
    log.info("%s ID: %s", finish_task, completed_id)

    config.bump_world_cpus(service)

    sdk_tasks.check_task_relaunched(
        finish_task, completed_id, ensure_new_task_not_completed=False)
    config.check_running(service)
def test_bump_hello_cpus():
    """Raise HELLO_CPUS by 0.1 through a Marathon PUT and expect new hello task ids."""
    check_running()

    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(ids_before))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    app_env = marathon_config['env']
    current_cpus = float(app_env['HELLO_CPUS'])
    app_env['HELLO_CPUS'] = str(current_cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=marathon_config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'hello', ids_before)
    check_running()
def test_finish_task_restarts_on_config_update():
    """Bump world CPUs and verify the completed world-0-finish task is relaunched."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    task_name = 'world-0-finish'
    world_finish_id = sdk_tasks.get_completed_task_id(task_name)
    assert world_finish_id is not None
    log.info('world_finish_id: ' + str(world_finish_id))

    # Only the config change matters here; the value returned by the bump was
    # never used, so it is no longer bound to a local.
    config.bump_world_cpus(foldered_name)

    sdk_tasks.check_task_relaunched(task_name, world_finish_id)
    config.check_running(foldered_name)
def test_bump_world_cpus():
    """Raise WORLD_CPUS by 0.1 through a Marathon PUT and expect new world task ids."""
    check_running()

    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')
    print('world ids: ' + str(ids_before))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    app_env = marathon_config['env']
    current_cpus = float(app_env['WORLD_CPUS'])
    app_env['WORLD_CPUS'] = str(current_cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=marathon_config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
def test_bump_hello_nodes():
    """Increase HELLO_COUNT by one through a Marathon PUT; existing hello tasks must keep their ids."""
    check_running()

    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(ids_before))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    app_env = marathon_config['env']
    new_count = int(app_env['HELLO_COUNT']) + 1
    app_env['HELLO_COUNT'] = str(new_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=marathon_config)

    check_running()
    # The pre-existing hello tasks must not have been relaunched.
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', ids_before)
def test_bump_hello_nodes():
    """Increase HELLO_COUNT by one via ``marathon.update_app``; existing hello tasks must keep their ids."""
    check_running()

    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(ids_before))

    marathon_config = marathon.get_config(PACKAGE_NAME)
    app_env = marathon_config['env']
    new_count = int(app_env['HELLO_COUNT']) + 1
    app_env['HELLO_COUNT'] = str(new_count)
    marathon.update_app(PACKAGE_NAME, marathon_config)

    check_running()
    # The pre-existing hello tasks must not have been relaunched.
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', ids_before)
def test_integrity_on_data_node_failure():
    """Write a file via data-0, kill the DataNode on data-0 and data-1, then read it via data-2."""
    shakedown.wait_for(
        lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_1_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    # Gives chance for write to succeed and replication to occur.
    time.sleep(5)

    tasks.kill_task_with_pattern("DataNode", 'data-0-node.hdfs.mesos')
    tasks.kill_task_with_pattern("DataNode", 'data-1-node.hdfs.mesos')

    # Give DataNode a chance to die.
    time.sleep(1)

    shakedown.wait_for(
        lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_1_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    check_running()
def test_integrity_on_name_node_failure():
    """
    Check that data can still be written and read after the first name node fails.

    The first name node (name-0-node) is the active name node by default when HDFS
    gets installed, so it is the one killed here.
    """
    tasks.kill_task_with_pattern("NameNode", 'name-0-node.hdfs.mesos')

    # Give NameNode a chance to die.
    time.sleep(1)

    shakedown.wait_for(
        lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_2_NAME),
        HDFS_CMD_TIMEOUT_SEC)
    shakedown.wait_for(
        lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_2_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    check_running()
def test_once_task_does_not_restart_on_config_update():
    """Bump hello CPUs and verify the completed hello-0-once task is not relaunched."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    task_name = 'hello-0-once'
    hello_once_id = sdk_tasks.get_completed_task_id(task_name)
    assert hello_once_id is not None
    log.info('hello_once_id: ' + str(hello_once_id))

    # Only the config change matters here; the value returned by the bump was
    # never used, so it is no longer bound to a local.
    config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_task_not_relaunched(foldered_name, task_name, hello_once_id)
    config.check_running(foldered_name)
def test_pod_replace():
    """Run `pod replace world-0`, validate the CLI reply, and expect new world-0 task ids."""
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    # get current agent id (TODO: uncomment if/when agent is guaranteed to change in a
    # replace operation):
    #jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod info world-0', json=True)
    #old_agent = jsonobj[0]['info']['slaveId']['value']

    reply = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0', json=True)
    assert len(reply) == 2
    assert reply['pod'] == 'world-0'
    assert len(reply['tasks']) == 1
    assert reply['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', ids_before)
    config.check_running()
def test_bump_world_cpus():
    """Bump world CPUs and verify every running world task reports the updated CPU value."""
    service = FOLDERED_SERVICE_NAME
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, 'world')
    log.info('world ids: ' + str(ids_before))

    updated_cpus = config.bump_world_cpus(service)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running(service)

    # Collect the world tasks that are currently in TASK_RUNNING.
    running_world = []
    for task in shakedown.get_service_tasks(service):
        if task['name'].startswith('world') and task['state'] == "TASK_RUNNING":
            running_world.append(task)

    assert len(running_world) == config.world_task_count(service)
    for task in running_world:
        assert close_enough(task['resources']['cpus'], updated_cpus)
def test_kill_scheduler():
    """Kill the scheduler process and verify Marathon replaces it while service tasks stay untouched."""
    service_task_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "")

    scheduler_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_prefix)
    assert len(scheduler_ids) == 1, "Expected to find ONLY one scheduler task but found {}".format(scheduler_ids)

    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld", "nobody", agent_host=scheduler_host)

    # The scheduler itself is a Marathon task, so its id must change ...
    sdk_tasks.check_tasks_updated("marathon", scheduler_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(config.SERVICE_NAME)
    config.check_running()
    # ... while the service's own tasks must keep their ids.
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "", service_task_ids)
def test_bump_hello_cpus():
    """Bump hello CPUs and verify every running hello task reports the updated CPU value."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, 'hello')
    log.info('hello ids: ' + str(ids_before))

    updated_cpus = config.bump_hello_cpus(service)

    sdk_tasks.check_tasks_updated(service, 'hello', ids_before)
    config.check_running(service)

    # Collect the hello tasks that are currently in TASK_RUNNING.
    running_hello = []
    for task in shakedown.get_service_tasks(service):
        if task['name'].startswith('hello') and task['state'] == "TASK_RUNNING":
            running_hello.append(task)

    assert len(running_hello) == config.hello_task_count(service)
    for task in running_hello:
        assert config.close_enough(task['resources']['cpus'], updated_cpus)
def test_state_refresh_disable_cache():
    """Toggle the DISABLE_STATE_CACHE scheduler envvar and check `debug state refresh_cache`.

    With caching on (the initial state) the command must succeed; with the envvar
    set it must eventually fail with a 409 Conflict; once the envvar is removed it
    must eventually succeed again. Service tasks must keep their ids throughout.
    """
    # NOTE(review): `foldered_name` is not assigned in this function; presumably it
    # is defined at module level - confirm.

    def refresh_cache():
        # Returns the (rc, stdout, stderr) triple of the refresh command.
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")

    config.check_running(foldered_name)
    ids_before = sdk_tasks.get_task_ids(foldered_name, "")

    # Caching enabled by default.
    rc, stdout, _ = refresh_cache()
    assert rc == 0, "Refresh cache failed"
    assert "Received cmd: refresh" in stdout

    app_definition = sdk_marathon.get_config(foldered_name)
    app_definition["env"]["DISABLE_STATE_CACHE"] = "any-text-here"
    sdk_marathon.update_app(app_definition)
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", ids_before)

    # Caching disabled: refresh_cache should fail with a 409 error (eventually, once
    # the scheduler is up).
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda ok: not ok)
    def check_cache_refresh_fails_409conflict():
        code, out, err = refresh_cache()
        return code != 0 and out == "" and "failed: 409 Conflict" in err

    check_cache_refresh_fails_409conflict()

    app_definition = sdk_marathon.get_config(foldered_name)
    del app_definition["env"]["DISABLE_STATE_CACHE"]
    sdk_marathon.update_app(app_definition)
    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", ids_before)

    # Caching re-enabled: refresh_cache should succeed (eventually, once the
    # scheduler is up).
    @retrying.retry(wait_fixed=1000, stop_max_delay=120 * 1000, retry_on_result=lambda out: not out)
    def check_cache_refresh():
        code, out, _ = refresh_cache()
        assert code == 0, "Refresh cache failed"
        return out

    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_config_update_while_partitioned():
    """Update WORLD_CPUS while world-0's agent is partitioned, reconnect, and verify the new CPU value."""
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    partitioned_host = sdk_hosts.system_host(config.SERVICE_NAME, "world-0-server")

    shakedown.partition_agent(partitioned_host)

    # Push the config change without waiting for the deployment to complete.
    app_definition = sdk_marathon.get_config(config.SERVICE_NAME)
    app_env = app_definition['env']
    updated_cpus = float(app_env['WORLD_CPUS']) + 0.1
    app_env['WORLD_CPUS'] = str(updated_cpus)
    sdk_marathon.update_app(
        config.SERVICE_NAME, app_definition, wait_for_completed_deployment=False)

    shakedown.reconnect_agent(partitioned_host)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', ids_before)
    config.check_running()

    # Collect the world tasks that are currently in TASK_RUNNING.
    running_world = []
    for task in shakedown.get_service_tasks(config.SERVICE_NAME):
        if task['name'].startswith('world') and task['state'] == "TASK_RUNNING":
            running_world.append(task)

    assert len(running_world) == config.world_task_count(config.SERVICE_NAME)
    for task in running_world:
        assert config.close_enough(task['resources']['cpus'], updated_cpus)
def test_bump_hello_cpus():
    """Bump hello CPUs and verify each running hello task reports the updated CPU value."""

    def close_enough(val0, val1):
        # Float comparison with an absolute tolerance of 1e-5.
        return abs(val0 - val1) < 0.00001

    service = config.SERVICE_NAME
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, 'hello')
    log.info('hello ids: ' + str(ids_before))

    updated_cpus = config.bump_hello_cpus(service)

    sdk_tasks.check_tasks_updated(service, 'hello', ids_before)
    config.check_running(service)

    # Collect the hello tasks that are currently in TASK_RUNNING.
    running_hello = []
    for task in shakedown.get_service_tasks(service):
        if task['name'].startswith('hello') and task['state'] == "TASK_RUNNING":
            running_hello.append(task)

    for task in running_hello:
        assert close_enough(task['resources']['cpus'], updated_cpus)
def test_mesos_v1_api():
    """Install the foldered service with ``mesos_api_version`` V1, check it runs, then restore the default."""
    service = FOLDERED_SERVICE_NAME

    def install_with(service_options):
        # Install under the foldered name with the given "service" option section.
        sdk_install.install(
            config.PACKAGE_NAME,
            service,
            config.DEFAULT_TASK_COUNT,
            additional_options={"service": service_options})

    # Install Hello World using the v1 api, then clean up afterwards.
    sdk_install.uninstall(config.PACKAGE_NAME, service)
    install_with({"name": service, "mesos_api_version": "V1"})
    config.check_running(service)
    sdk_install.uninstall(config.PACKAGE_NAME, service)

    # Reinstall the v0 version for the following tests.
    install_with({"name": service})
def test_bump_hello_cpus():
    """Bump hello CPUs and verify each running hello task reports the updated CPU value."""

    def close_enough(val0, val1):
        # Float comparison with an absolute tolerance of 1e-5.
        return abs(val0 - val1) < 0.00001

    service = config.SERVICE_NAME
    config.check_running(service)

    ids_before = sdk_tasks.get_task_ids(service, "hello")
    log.info("hello ids: " + str(ids_before))

    updated_cpus = config.bump_hello_cpus(service)

    sdk_tasks.check_tasks_updated(service, "hello", ids_before)
    config.check_running(service)

    # Only tasks currently in TASK_RUNNING are compared against the new value.
    for task in sdk_tasks.get_service_tasks(service, task_prefix="hello"):
        if task.state != "TASK_RUNNING":
            continue
        assert close_enough(task.resources["cpus"], updated_cpus)