# Imports assumed from the dcos-commons test conventions; the originals were
# not included in this excerpt:
import logging
import re

import sdk_cmd
import sdk_install
import sdk_plan
import sdk_tasks

from tests import config

log = logging.getLogger(__name__)


# Variant targeting the hello-world service via a single-task helper.
def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent


# Variant that relies on local helpers (get_pod_to_replace, get_pod_agent; not
# defined in this excerpt) and raw 'dcos task' dumps instead of sdk_tasks.get_summary().
def test_shutdown_host():
    # Print a dump of current tasks in the cluster (and what agents they're on)
    sdk_cmd.run_cli('task')

    replace_pod = get_pod_to_replace()
    assert replace_pod is not None, 'Could not find a node to shut down'

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_pod['host'])

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod['name']))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Another dump of current cluster tasks, now that repair has started.
    sdk_cmd.run_cli('task')

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # One last task dump for good measure.
    sdk_cmd.run_cli('task')

    new_agent = get_pod_agent(replace_pod['name'])
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old_pod={}\nnew_agent={}'.format(replace_pod, new_agent))
    assert replace_pod['agent'] != new_agent
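

# The two helpers used above are not defined in this excerpt. A minimal sketch
# of what they might look like, assuming sdk_tasks.get_summary() returns task
# records with .name/.host/.agent attributes as in the other variants. These
# are hypothetical reconstructions, not the original implementations.
def get_pod_to_replace():
    # Describe the first '<pod>-server' task found as a pod dict.
    for task in sdk_tasks.get_summary():
        if task.name.endswith('-server'):
            return {
                'name': task.name[:-len('-server')],  # "hello-0-server" => "hello-0"
                'host': task.host,
                'agent': task.agent,
            }
    return None


def get_pod_agent(pod_name):
    # Return the agent currently running the pod's '-server' task, if any.
    for task in sdk_tasks.get_summary():
        if task.name == '{}-server'.format(pod_name):
            return task.agent
    return None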


# Variant targeting a Cassandra-style service, where each 'node-N-server' task
# is expected to run on its own host.
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent


# Variant for hello-world where several tasks may share one host: every pod on
# the downed agent is replaced, and uninstall validation is told to expect
# orphaned resources from that agent.
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info('Tasks on host {} to be replaced after shutdown: {}'.format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len('-server')] for task in replace_tasks])
    assert len(replace_pods) == len(replace_tasks), \
        'Expected one task per pod in tasks to replace: {}'.format(replace_tasks)

    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id][0]
        log.info('Checking affected task has moved to a new agent:\n'
                 'old={}\nnew={}'.format(replaced_task, new_task))
        assert replaced_task.agent != new_task.agent


# Same multi-pod replacement flow as above, minus the
# sdk_install.ignore_dead_agent() registration.
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info('Tasks on host {} to be replaced after shutdown: {}'.format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len('-server')] for task in replace_tasks])
    assert len(replace_pods) == len(replace_tasks), \
        'Expected one task per pod in tasks to replace: {}'.format(replace_tasks)

    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id][0]
        log.info('Checking affected task has moved to a new agent:\n'
                 'old={}\nnew={}'.format(replaced_task, new_task))
        assert replaced_task.agent != new_task.agent