def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=25 * 60)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=25 * 60)
    sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=25 * 60)
    sdk_tasks.check_running(service_name, count)
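# Usage sketch (hypothetical, not from the source): after an operation that is expected to
# trigger the recovery plan, a test waits for the service to settle before asserting anything
# else. config.SERVICE_NAME and the sdk_* modules are assumed from the surrounding test module.
def test_service_settles_after_recovery():
    config.check_healthy(service_name=config.SERVICE_NAME, recovery_expected=True)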
def test_modify_app_config():
    """This test checks that modifying the app config does not trigger a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def _replace_data_nodes_and_wait_for_completed_recovery(nodes: int) -> None:
    for d in range(nodes):
        sdk_cmd.svc_cli(package_name, service_name, "pod replace data-{}".format(d))
        sdk_plan.wait_for_completed_recovery(service_name)
        config.wait_for_expected_nodes_to_exist(
            service_name=service_name, task_count=ALL_NODES_NUMBER
        )
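# Usage sketch (hypothetical, not from the source): replace every data node in turn; the helper
# above already waits for recovery and for the expected node count after each replacement.
# DEFAULT_DATA_NODE_COUNT is an assumed module-level constant.
def test_replace_all_data_nodes():
    _replace_data_nodes_and_wait_for_completed_recovery(DEFAULT_DATA_NODE_COUNT)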
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9({
        "service": {
            "yaml": "marathon_constraint"
        },
        "hello": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        },
        "world": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        }
    })

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME,
                        config.get_num_private_agents() * 2,
                        additional_options=options)

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME,
                            config.get_num_private_agents() * 2 - 1,
                            timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME, config.get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
            "world": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
        }
    )

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        get_num_private_agents() * 2,
        additional_options=options,
    )

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(
        config.SERVICE_NAME, get_num_private_agents() * 2 - 1, timeout_seconds=10
    )
    sdk_tasks.check_running(config.SERVICE_NAME, get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == "node-2-server"
    ][0]
    log.info("avoid host for task {}".format(replace_task))

    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host
        )
        sdk_marathon.update_app(marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS
        )
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_shutdown_host():
    # Print a dump of current tasks in the cluster (and what agents they're on)
    sdk_cmd.run_cli('task')

    replace_pod = get_pod_to_replace()
    assert replace_pod is not None, 'Could not find a node to shut down'

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_pod['host'])

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod['name']))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Another dump of current cluster tasks, now that repair has started.
    sdk_cmd.run_cli('task')

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # One last task dump for good measure.
    sdk_cmd.run_cli('task')

    new_agent = get_pod_agent(replace_pod['name'])
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old_pod={}\nnew_agent={}'.format(replace_pod, new_agent))
    assert replace_pod['agent'] != new_agent
def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_envvar_accross_restarts():
    sleep_duration = 9999
    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_package_version=None,
        additional_options={
            "service": {"name": config.SERVICE_NAME, "sleep": sleep_duration, "yaml": "sidecar"}
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    for attempt in range(3):
        cmd_list = ["pod", "restart", "hello-0"]
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, " ".join(cmd_list))

        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, "hello-0-server", "env")

        envvar = "CONFIG_SLEEP_DURATION="
        envvar_pos = stdout.find(envvar)
        if envvar_pos < 0:
            raise Exception("Required envvar not found")

        if not stdout[envvar_pos + len(envvar):].startswith("{}".format(sleep_duration)):
            found_string = stdout[envvar_pos + len(envvar):envvar_pos + len(envvar) + 15]
            log.error(
                "(%d) Looking for %s%d but found: %s", attempt, envvar, sleep_duration, found_string
            )
            raise Exception("Envvar not set to required value")
def test_modify_app_config():
    """This test checks that modifying the app config does not trigger a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )

    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
def test_modify_app_config():
    """This test checks that modifying the app config does not trigger a recovery."""
    # `foldered_name` is assumed to be defined at module scope in this test module.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "name", name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME,
                                             timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    # (`cmd` is assumed to be this test module's import alias for sdk_cmd)
    cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME,
                                         timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def restart_zookeeper_node(id: int):
    sdk_cmd.svc_cli(config.ZOOKEEPER_PACKAGE_NAME, config.ZOOKEEPER_SERVICE_NAME,
                    "pod restart zookeeper-{}".format(id))

    sdk_plan.wait_for_kicked_off_recovery(config.ZOOKEEPER_SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.ZOOKEEPER_SERVICE_NAME)
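# Usage sketch (hypothetical, not from the source): roll through a three-node ZooKeeper ensemble;
# restart_zookeeper_node() already waits for the recovery plan to complete between restarts.
def test_rolling_zookeeper_restart():
    for node_id in range(3):
        restart_zookeeper_node(node_id)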
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    task_ids = get_task_ids(service_name, prefix)
    task_sets = "\n- Old tasks: {}\n- Current tasks: {}".format(sorted(old_task_ids), sorted(task_ids))
    log.info('Checking tasks starting with "{}" have not been updated:{}'.format(prefix, task_sets))
    assert set(old_task_ids).issubset(set(task_ids)), "Tasks got updated:{}".format(task_sets)
def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )

    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)
def test_master_node_replace():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace master-0')
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_node_replace() -> None:
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(package_name, service_name, "pod replace master-0")
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_metrics() -> None:
    expected_metrics = [
        "node.data-0-node.fs.total.total_in_bytes",
        "node.data-0-node.jvm.mem.pools.old.peak_used_in_bytes",
        "node.data-0-node.jvm.threads.count",
    ]

    def expected_metrics_exist(emitted_metrics: List[str]) -> bool:
        # Elastic metrics are also dynamic and based on the service name. For example:
        # elasticsearch.test__integration__elastic.node.data-0-node.thread_pool.listener.completed
        # To prevent this from breaking, we drop the "elasticsearch.<service name>" prefix:
        # => node.data-0-node.thread_pool.listener.completed
        metric_names = [".".join(metric_name.split(".")[2:]) for metric_name in emitted_metrics]
        return sdk_metrics.check_metrics_presence(metric_names, expected_metrics)

    sdk_metrics.wait_for_service_metrics(
        package_name,
        service_name,
        "data-0",
        "data-0-node",
        config.DEFAULT_TIMEOUT,
        expected_metrics_exist,
    )

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
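# Quick illustration (not from the source) of the metric-name stripping used above: dropping the
# first two dot-separated components removes the "elasticsearch.<service name>" prefix so that
# emitted names can be compared against the static expected_metrics list.
emitted = "elasticsearch.test__integration__elastic.node.data-0-node.thread_pool.listener.completed"
assert ".".join(emitted.split(".")[2:]) == "node.data-0-node.thread_pool.listener.completed"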
def test_metrics():
    expected_metrics = [
        "node.data-0-node.fs.total.total_in_bytes",
        "node.data-0-node.jvm.mem.pools.old.peak_used_in_bytes",
        "node.data-0-node.jvm.threads.count",
    ]

    def expected_metrics_exist(emitted_metrics):
        # Elastic metrics are also dynamic and based on the service name. For example:
        # elasticsearch.test__integration__elastic.node.data-0-node.thread_pool.listener.completed
        # To prevent this from breaking, we drop the "elasticsearch.<service name>" prefix:
        # => node.data-0-node.thread_pool.listener.completed
        metric_names = [
            ".".join(metric_name.split(".")[2:]) for metric_name in emitted_metrics
        ]
        return sdk_metrics.check_metrics_presence(metric_names, expected_metrics)

    sdk_metrics.wait_for_service_metrics(
        config.PACKAGE_NAME,
        foldered_name,
        "data-0",
        "data-0-node",
        config.DEFAULT_TIMEOUT,
        expected_metrics_exist,
    )

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME,
                                         timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    task_ids = get_task_ids(service_name, prefix)
    task_sets = "\n- Old tasks: {}\n- Current tasks: {}".format(sorted(old_task_ids), sorted(task_ids))
    log.info('Checking tasks starting with "{}" have not been updated:{}'.format(prefix, task_sets))
    assert set(old_task_ids).issubset(set(task_ids)), \
        'Tasks starting with "{}" were updated:{}'.format(prefix, task_sets)
def test_indexing(default_populated_index: None) -> None:
    indices_stats = config.get_elasticsearch_indices_stats(index_name, service_name=service_name)
    assert indices_stats["_all"]["primaries"]["docs"]["count"] == 1
    doc = config.get_document(index_name, index_type, 1, service_name=service_name)
    assert doc["_source"]["name"] == "Loren"

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green",
                                            service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, "data-0-node"),
                                   "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "yellow",
                                            service_name=foldered_name)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green",
                                            service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_indexing(default_populated_index):
    indices_stats = config.get_elasticsearch_indices_stats(config.DEFAULT_INDEX_NAME,
                                                           service_name=foldered_name)
    assert indices_stats["_all"]["primaries"]["docs"]["count"] == 1
    doc = config.get_document(config.DEFAULT_INDEX_NAME, config.DEFAULT_INDEX_TYPE, 1,
                              service_name=foldered_name)
    assert doc["_source"]["name"] == "Loren"

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_mesos_v0_api():
    prior_api_version = sdk_marathon.get_mesos_api_version(foldered_name)
    # Use '!=' for string comparison; 'is not' checks identity and is unreliable for literals.
    if prior_api_version != "V0":
        sdk_marathon.set_mesos_api_version(foldered_name, "V0")
        sdk_marathon.set_mesos_api_version(foldered_name, prior_api_version)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are checked to
    ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should result in ONLY
    the kafka-0-broker task being restarted. In this case, pods_with_updated_tasks is specified
    as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a rolling
    restart of other nodes is triggered, and pods_with_updated_tasks = ["node-0", "node-1",
    "node-2"] (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Record each pod's own task IDs (not just pod_name's) so every affected pod can be
        # checked for replacement below.
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
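# Usage sketch based on the docstring above (package and service names are illustrative
# assumptions): replacing a Cassandra seed node is expected to also roll the other nodes, so all
# three pods are listed in pods_with_updated_tasks; a plain Kafka broker replace would pass None.
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)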
def test_plugin_install_and_uninstall(default_populated_index):
    plugin_name = 'analysis-phonetic'
    config.update_app(foldered_name,
                      {'TASKCFG_ALL_ELASTICSEARCH_PLUGINS': plugin_name},
                      current_expected_task_count)
    config.check_elasticsearch_plugin_installed(plugin_name, service_name=foldered_name)

    config.update_app(foldered_name,
                      {'TASKCFG_ALL_ELASTICSEARCH_PLUGINS': ''},
                      current_expected_task_count)
    config.check_elasticsearch_plugin_uninstalled(plugin_name, service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_reelection():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, initial_master),
                                   "master__.*Elasticsearch")
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify it moves hosts
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", old_ids)

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    assert get_task_host("hello-0-server") == other_agent
def test_envvar_accross_restarts():
    class ConfigException(Exception):
        pass

    def assert_envvar_has_value(envvar: str, expected_value: str):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, "hello-0-server", "env")
        env = dict(l.strip().split("=", 1) for l in stdout.strip().split('\n'))
        val = env.get(envvar, "absent")

        if val == "absent":
            raise ConfigException("Required envvar not found")

        if val != expected_value:
            log.error("Looking for %s=%s but found: %s", envvar, expected_value, val)
            raise ConfigException("Envvar not set to required value")

        log.info("%s has expected value %s", envvar, expected_value)

    envvar = "CONFIG_SLEEP_DURATION"
    sleep_duration = 9999

    try:
        assert_envvar_has_value(envvar, str(sleep_duration))
    except ConfigException:
        log.debug("%s is set to something other than %d as expected", envvar, sleep_duration)

    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_version=None,
        to_options={
            "service": {
                "name": config.SERVICE_NAME,
                "sleep": sleep_duration,
                "yaml": "sidecar"
            }
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    log.info("Checking after update")
    assert_envvar_has_value(envvar, str(sleep_duration))

    cmd_list = ["pod", "restart", "hello-0"]
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, " ".join(cmd_list))

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    log.info("Checking after restart")
    assert_envvar_has_value(envvar, str(sleep_duration))
def test_endpoints():
    # check that we can reach the scheduler via admin router, and that returned endpoints are sanitized:
    for endpoint in config.ENDPOINT_TYPES:
        endpoints = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                                    'endpoints {}'.format(endpoint), json=True)
        host = endpoint.split('-')[0]  # 'coordinator-http' => 'coordinator'
        assert endpoints['dns'][0].startswith(sdk_hosts.autoip_host(foldered_name, host + '-0-node'))
        assert endpoints['vip'].startswith(sdk_hosts.vip_host(foldered_name, host))

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_updated_placement_constraints_restarted_tasks_dont_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Restart the task, and verify it doesn't move hosts
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", old_ids)

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    assert get_task_host("hello-0-server") == some_agent
def test_master_reelection():
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, initial_master),
                                   "master__.*Elasticsearch")
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def check_task_not_relaunched(service_name, task_name, old_task_id, timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    try:
        task_ids = set([t['id'] for t in shakedown.get_tasks() if t['name'] == task_name])
    except dcos.errors.DCOSHTTPException:
        log.info('Failed to get task ids for service {}'.format(service_name))
        task_ids = set([])

    assert len(task_ids) == 1 and old_task_id in task_ids
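# Usage sketch (hypothetical, not from the source): capture the task ID before the operation
# under test, then assert the task was not relaunched once deployment and recovery have settled.
old_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")[0]
# ... perform the operation that should leave hello-0-server untouched ...
check_task_not_relaunched(config.SERVICE_NAME, "hello-0-server", old_task_id)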
def test_endpoints() -> None:
    # Check that we can reach the scheduler via admin router, and that returned endpoints are
    # sanitized.
    for endpoint in config.ENDPOINT_TYPES:
        endpoints = sdk_networks.get_endpoint(package_name, service_name, endpoint)
        host = endpoint.split("-")[0]  # 'coordinator-http' => 'coordinator'
        assert endpoints["dns"][0].startswith(sdk_hosts.autoip_host(service_name, host + "-0-node"))
        assert endpoints["vip"].startswith(sdk_hosts.vip_host(service_name, host))

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def check_tasks_not_updated(service_name: str, prefix: str, old_task_ids: Iterable[str]) -> None:
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    wait_for_active_framework(service_name)
    task_ids = get_task_ids(service_name, prefix)
    task_sets = "\n- Old tasks: {}\n- Current tasks: {}".format(
        sorted(old_task_ids), sorted(task_ids)
    )
    log.info('Checking tasks starting with "{}" have not been updated:{}'.format(prefix, task_sets))
    assert set(old_task_ids).issubset(
        set(task_ids)
    ), 'Tasks starting with "{}" were updated:{}'.format(prefix, task_sets)
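# Usage sketch (hypothetical, not from the source): verify that replacing world-0 leaves the
# hello tasks untouched. The old task IDs are captured before the operation being checked.
hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")
sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace world-0")
sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "hello", hello_ids)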
def test_losing_and_regaining_index_health(default_populated_index: None) -> None:
    config.check_elasticsearch_index_health(index_name, "green", service_name=service_name)
    sdk_cmd.kill_task_with_pattern(
        "data__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(service_name, "data-0-node")[0].host,
    )
    config.check_elasticsearch_index_health(index_name, "yellow", service_name=service_name)
    config.check_elasticsearch_index_health(index_name, "green", service_name=service_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
def test_bump_node_counts():
    # bump ingest and coordinator, but NOT data, which is bumped in the following test.
    # we want to avoid adding two data nodes because the cluster sometimes won't have enough room for it
    marathon_config = sdk_marathon.get_config(foldered_name)
    ingest_nodes = int(marathon_config['env']['INGEST_NODE_COUNT'])
    marathon_config['env']['INGEST_NODE_COUNT'] = str(ingest_nodes + 1)
    coordinator_nodes = int(marathon_config['env']['COORDINATOR_NODE_COUNT'])
    marathon_config['env']['COORDINATOR_NODE_COUNT'] = str(coordinator_nodes + 1)
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    global current_expected_task_count
    current_expected_task_count += 2
    sdk_tasks.check_running(foldered_name, current_expected_task_count)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_master_reelection() -> None:
    initial_master = config.get_elasticsearch_master(service_name=service_name)
    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(service_name, initial_master)[0].host,
    )
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    config.wait_for_expected_nodes_to_exist(service_name=service_name)
    new_master = config.get_elasticsearch_master(service_name=service_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def test_custom_yaml_base64():
    # apply this custom YAML block as a base64-encoded string:
    # cluster:
    #   routing:
    #     allocation:
    #       node_initial_primaries_recoveries: 3
    # The default value is 4. We're just testing to make sure the YAML formatting survived intact
    # and the setting got updated in the config.
    base64_str = 'Y2x1c3RlcjoNCiAgcm91dGluZzoNCiAgICBhbGxvY2F0aW9uOg0KIC' \
                 'AgICAgbm9kZV9pbml0aWFsX3ByaW1hcmllc19yZWNvdmVyaWVzOiAz'

    config.update_app(foldered_name,
                      {'CUSTOM_YAML_BLOCK_BASE64': base64_str},
                      current_expected_task_count)
    config.check_custom_elasticsearch_cluster_setting(service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
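# For reference (not from the source): a base64 string like the one above can be produced with
# the standard library; the encoded YAML uses CRLF line endings and two-space indentation.
import base64

yaml_block = (
    "cluster:\r\n"
    "  routing:\r\n"
    "    allocation:\r\n"
    "      node_initial_primaries_recoveries: 3"
)
encoded = base64.b64encode(yaml_block.encode("utf-8")).decode("ascii")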
def test_pod_replace_then_immediate_config_update():
    plugin_name = 'analysis-phonetic'

    cfg = sdk_marathon.get_config(foldered_name)
    cfg['env']['TASKCFG_ALL_ELASTICSEARCH_PLUGINS'] = plugin_name
    cfg['env']['UPDATE_STRATEGY'] = 'parallel'

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace data-0')

    # issue config update immediately
    sdk_marathon.update_app(foldered_name, cfg)

    # ensure all nodes, especially data-0, get launched with the updated config
    config.check_elasticsearch_plugin_installed(plugin_name, service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)