def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)

    # Just pick the first one from the list
    replace_task = candidate_tasks[0]
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the agent of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(
            replace_agent_id, replace_tasks)
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )
    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the agent of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)

    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
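# The tests above all repeat the same lookup for the relocated task; when no replacement is
# found, the bare [0] index fails with an opaque IndexError. A minimal sketch of a hypothetical
# helper (an assumption for illustration, not taken from the SDK test utilities) that makes
# that failure explicit:
def find_relocated_task(old_task, new_tasks):
    """Return the new incarnation of old_task: same task name, different task ID."""
    matches = [
        task for task in new_tasks
        if task.name == old_task.name and task.id != old_task.id
    ]
    assert len(matches) != 0, "No relocated task found for {}".format(old_task.name)
    return matches[0]
# Usage inside the loops above would be: new_task = find_relocated_task(replaced_task, new_tasks)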