Example #1
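This and the later examples are drawn from integration tests in the dcos-commons SDK repository and all assume the same module-level setup. A minimal preamble along these lines would be needed to run any of them (a sketch: the sdk_* helper modules ship with the dcos-commons testing tools, while the exact `config` import path is an assumption):

import logging
import re

import sdk_cmd
import sdk_install
import sdk_plan
import sdk_tasks
from tests import config

log = logging.getLogger(__name__)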
def test_shutdown_host():
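    # Pick one hello/world server task that is not running on the scheduler's agent.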
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example #2
def test_shutdown_host():
    # Print a dump of current tasks in the cluster (and what agents they're on)
    sdk_cmd.run_cli('task')

    replace_pod = get_pod_to_replace()
    assert replace_pod is not None, 'Could not find a node to shut down'

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_pod['host'])

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod['name']))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Another dump of current cluster tasks, now that repair has started.
    sdk_cmd.run_cli('task')

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # One last task dump for good measure.
    sdk_cmd.run_cli('task')

    new_agent = get_pod_agent(replace_pod['name'])
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old_pod={}\nnew_agent={}'.format(replace_pod, new_agent))
    assert replace_pod['agent'] != new_agent
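Example #2 relies on two helpers, get_pod_to_replace() and get_pod_agent(), which are defined elsewhere in its test module and are not shown here. A rough sketch of what they might look like on top of `dcos task --json` output (the JSON field names and sdk_cmd.run_cli returning stdout as a string are assumptions, not taken from the source):

import json

def get_pod_to_replace():
    # Describe the first '-server' task found in the cluster task list.
    for task in json.loads(sdk_cmd.run_cli('task --json')):
        if task['name'].endswith('-server'):
            return {
                'name': task['name'][:-len('-server')],  # "hello-0-server" => "hello-0"
                'host': task.get('host'),                # assumed field name
                'agent': task['slave_id'],
            }
    return None

def get_pod_agent(pod_name):
    # Return the agent id currently hosting the pod's '-server' task.
    for task in json.loads(sdk_cmd.run_cli('task --json')):
        if task['name'] == '{}-server'.format(pod_name):
            return task['slave_id']
    return None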
Example #3
def test_shutdown_host():
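    # Find all Cassandra node tasks that are not colocated with the scheduler.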
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example #4
def test_shutdown_host():
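    # Find all Cassandra node tasks that are not colocated with the scheduler.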
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example #5
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [
        task for task in candidate_tasks if task.host == replace_hostname
    ]
    log.info('Tasks on host {} to be replaced after shutdown: {}'.format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len('-server')] for task in replace_tasks])
    assert len(replace_pods) == len(replace_tasks), \
        'Expected one task per pod in tasks to replace: {}'.format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info('Checking affected task has moved to a new agent:\n'
                 'old={}\nnew={}'.format(replaced_task, new_task))
        assert replaced_task.agent != new_task.agent
Example #6
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [
        task for task in candidate_tasks
        if task.host == replace_hostname]
    log.info('Tasks on host {} to be replaced after shutdown: {}'.format(replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len('-server')] for task in replace_tasks])
    assert len(replace_pods) == len(replace_tasks), \
        'Expected one task per pod in tasks to replace: {}'.format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id][0]
        log.info('Checking affected task has moved to a new agent:\n'
                 'old={}\nnew={}'.format(replaced_task, new_task))
        assert replaced_task.agent != new_task.agent
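Despite differences in how they pick tasks and how much they log, all six variants exercise the same recovery pattern: permanently shut down an agent, `pod replace` every pod that lived on it, wait for the recovery plan to start and finish, and verify that the replacement tasks landed on different agents. Distilled into one generic helper (an illustrative sketch, not part of the source):

def replace_pods_on_dead_agent(hostname, pod_names):
    # Shut the host down for good, rather than partitioning or reconnecting it.
    sdk_cmd.shutdown_agent(hostname)
    # Ask the scheduler to replace every pod that was running on that host.
    for pod_name in pod_names:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(pod_name))
    # Wait for recovery to kick off and complete, then confirm the task count.
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)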