Example No. 1
def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=25 * 60)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=25 * 60)
    sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=25 * 60)
    sdk_tasks.check_running(service_name, count)
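A minimal usage sketch for the helper above (a hypothetical call site; config.PACKAGE_NAME, config.SERVICE_NAME, DEFAULT_TASK_COUNT, and the sdk_* modules are assumed to come from the surrounding dcos-commons test harness). After replacing a pod, passing recovery_expected=True makes the helper also wait for a recovery plan to be kicked off before checking the running task count.

# Hypothetical caller: replace a pod, then wait for the service to settle again.
sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace node-0")
check_healthy(config.SERVICE_NAME, count=DEFAULT_TASK_COUNT, recovery_expected=True)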
Example No. 2
    def restart_zookeeper_node(id: int):
        sdk_cmd.svc_cli(config.ZOOKEEPER_PACKAGE_NAME,
                        config.ZOOKEEPER_SERVICE_NAME,
                        "pod restart zookeeper-{}".format(id))

        sdk_plan.wait_for_kicked_off_recovery(config.ZOOKEEPER_SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.ZOOKEEPER_SERVICE_NAME)
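A short sketch of how this nested helper might be driven from its enclosing test, assuming a three-node ZooKeeper ensemble (the node count is an assumption, not part of the original snippet):

    # Restart each ZooKeeper pod in turn, waiting for recovery between restarts.
    for zk_id in range(3):
        restart_zookeeper_node(zk_id)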
Example No. 3
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example No. 4
def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example No. 5
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )

    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
Example No. 6
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

    finally:
        # Revert to the prior placement setting before proceeding with tests, to avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
Example No. 7
def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )

    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)
Example No. 8
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    cmd.run_cli('cassandra pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(PACKAGE_NAME)
    sdk_plan.wait_for_completed_recovery(PACKAGE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example No. 9
def test_node_replace_replaces_node() -> None:
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == "node-2-server"
    ][0]
    log.info("avoid host for task {}".format(replace_task))

    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        marathon_config["env"][
            "PLACEMENT_CONSTRAINT"] = '[["hostname", "UNLIKE", "{}"]]'.format(
                replace_task.host)
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

    finally:
        # Revert to the prior placement setting before proceeding with tests, to avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
Example No. 10
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9({
        "service": {
            "spec_file": "examples/marathon_constraint.yml"
        },
        "hello": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        },
        "world": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        }
    })

    sdk_install.install(config.PACKAGE_NAME,
                        config.SERVICE_NAME,
                        config.get_num_private_agents() * 2,
                        additional_options=options)
    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace hello-0')
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME,
                            config.get_num_private_agents() * 2 - 1,
                            timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME,
                            config.get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
Example No. 11
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example No. 12
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9({
        "service": {
            "yaml": "marathon_constraint"
        },
        "hello": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        },
        "world": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        }
    })

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME,
        config.get_num_private_agents() * 2, additional_options=options)
    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.get_num_private_agents() * 2 - 1, timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME, config.get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
Example No. 13
def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=25 * 60)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=25 * 60)
    sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=25 * 60)
    sdk_tasks.check_running(service_name, count)
Example No. 14
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three node Cassandra ring)
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
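Two hedged usage sketches based on the docstring above; the package names, service names, and timeout value are illustrative placeholders rather than values from the original tests.

# Kafka-style case: only the replaced pod's own task is expected to restart.
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)

# Cassandra seed-node case: replacing node-0 also triggers restarts of the other ring members.
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)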
Example No. 15
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example No. 16
def test_envvar_accross_restarts():
    class ConfigException(Exception):
        pass

    def assert_envvar_has_value(envvar: str, expected_value: str):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME,
                                                 "hello-0-server", "env")
        env = dict(l.strip().split("=", 1) for l in stdout.strip().split('\n'))
        val = env.get(envvar, "absent")

        if val == "absent":
            raise ConfigException("Required envvar not found")

        if val != expected_value:
            log.error("Looking for %s=%s but found: %s", envvar,
                      expected_value, val)
            raise ConfigException("Envvar not set to required value")

        log.info("%s has expected value %s", envvar, expected_value)

    envvar = "CONFIG_SLEEP_DURATION"
    sleep_duration = 9999

    try:
        assert_envvar_has_value(envvar, str(sleep_duration))
    except ConfigException:
        log.debug("%s is set to something other than %d as expected", envvar,
                  sleep_duration)

    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_version=None,
        to_options={
            "service": {
                "name": config.SERVICE_NAME,
                "sleep": sleep_duration,
                "yaml": "sidecar"
            }
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    log.info("Checking after update")
    assert_envvar_has_value(envvar, str(sleep_duration))

    cmd_list = ["pod", "restart", "hello-0"]
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    " ".join(cmd_list))

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    log.info("Checking after restart")
    assert_envvar_has_value(envvar, str(sleep_duration))
Example No. 17
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
Example No. 18
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$"))
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [
        task for task in candidate_tasks if task.host == replace_hostname
    ]
    log.info("Tasks on host {} to be replaced after shutdown: {}".format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(
        replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info("Checking affected task has moved to a new agent:\n"
                 "old={}\nnew={}".format(replaced_task, new_task))
        assert replaced_task.agent_id != new_task.agent_id
Example No. 19
def test_envvar_accross_restarts():

    class ConfigException(Exception):
        pass

    def assert_envvar_has_value(envvar: str, expected_value: str):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, "hello-0-server", "env")
        env = dict(l.strip().split("=", 1) for l in stdout.strip().split('\n'))
        val = env.get(envvar, "absent")

        if val == "absent":
            raise ConfigException("Required envvar not found")

        if val != expected_value:
            log.error("Looking for %s=%s but found: %s", envvar, expected_value, val)
            raise ConfigException("Envvar not set to required value")

        log.info("%s has expected value %s", envvar, expected_value)

    envvar = "CONFIG_SLEEP_DURATION"
    sleep_duration = 9999

    try:
        assert_envvar_has_value(envvar, str(sleep_duration))
    except ConfigException:
        log.debug("%s is set to something other than %d as expected", envvar, sleep_duration)

    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_version=None,
        to_options={
            "service": {"name": config.SERVICE_NAME, "sleep": sleep_duration, "yaml": "sidecar"}
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    log.info("Checking after update")
    assert_envvar_has_value(envvar, str(sleep_duration))

    cmd_list = ["pod", "restart", "hello-0"]
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, " ".join(cmd_list))

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    log.info("Checking after restart")
    assert_envvar_has_value(envvar, str(sleep_duration))
Example No. 20
def test_node_replace_replaces_node():
    pod_to_replace = 'node-2'
    pod_host = get_pod_host(pod_to_replace)
    log.info('avoid host for pod {}: {}'.format(pod_to_replace, pod_host))

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['PLACEMENT_CONSTRAINT'] = 'hostname:UNLIKE:{}'.format(pod_host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example No. 21
def test_config_update_across_restart():
    foldered_service_name = config.get_foldered_service_name()

    batch_size_warn_threshold_in_kb = 15
    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        foldered_service_name,
        to_package_version=None,
        additional_options={
            "service": {"name": foldered_service_name},
            "cassandra": {"batch_size_warn_threshold_in_kb": batch_size_warn_threshold_in_kb},
        },
        expected_running_tasks=config.DEFAULT_TASK_COUNT,
        wait_for_deployment=True,
        timeout_seconds=config.DEFAULT_CASSANDRA_TIMEOUT,
    )

    for _ in range(3):
        cmd_list = ["pod", "restart", "node-0"]
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_service_name, " ".join(cmd_list))

        sdk_plan.wait_for_kicked_off_recovery(foldered_service_name)
        sdk_plan.wait_for_completed_recovery(
            foldered_service_name, timeout_seconds=config.DEFAULT_CASSANDRA_TIMEOUT
        )

        _, stdout, _ = sdk_cmd.service_task_exec(foldered_service_name, "node-0-server", "env")

        envvar = "CASSANDRA_BATCH_SIZE_WARN_THRESHOLD_IN_KB="
        envvar_pos = stdout.find(envvar)
        if envvar_pos < 0:
            raise Exception("Required envvar not found")

        if not stdout[envvar_pos + len(envvar) :].startswith(
            "{}".format(batch_size_warn_threshold_in_kb)
        ):
            found_string = stdout[envvar_pos + len(envvar) : envvar_pos + len(envvar) + 15]
            log.error(
                "Looking for %s%d but found: %s",
                envvar,
                batch_size_warn_threshold_in_kb,
                found_string,
            )
            raise Exception("Envvar not set to required value")
Example No. 22
def test_replace_pods_to_legacy_role():

    # Issue pod replace operations until the pods have moved to the legacy role.
    replace_pods = ["hello-0", "world-0", "world-1"]

    for pod in replace_pods:
        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, SERVICE_NAME,
                        "pod replace {}".format(pod))
        sdk_plan.wait_for_kicked_off_recovery(
            SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
        sdk_plan.wait_for_completed_recovery(
            SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

        # Get the current service state to verify roles have applied.
        service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
        current_task_roles = service_roles["task-roles"]
        task_name = "{}-server".format(pod)

        # Ensure we have transitioned over to the legacy role.
        assert current_task_roles[task_name] == LEGACY_ROLE

    # Get the refreshed roles after the pod replace operations.
    service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
    current_task_roles = service_roles["task-roles"]

    # We must have some role!
    assert len(current_task_roles) > 0

    assert LEGACY_ROLE in current_task_roles.values()
    assert ENFORCED_ROLE not in current_task_roles.values()

    # Ensure we're MULTI_ROLE
    assert service_roles["framework-roles"] is not None
    assert service_roles["framework-role"] is None

    assert len(service_roles["framework-roles"]) == 2
    assert LEGACY_ROLE in service_roles["framework-roles"]
    assert ENFORCED_ROLE in service_roles["framework-roles"]
Example No. 23
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
):
    """
    Perform a replace operation on a specified pod and check that it is replaced

    All other pods are checked to see if they remain consistent.
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    pod_list = sdk_cmd.svc_cli(package_name, service_name, "pod list", json=True)

    tasks_to_replace = set(sdk_tasks.get_task_ids(service_name, pod_name))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list:
        if pod == pod_name:
            continue
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    replace_cmd = ["pod", "replace", pod_name]
    sdk_cmd.svc_cli(package_name, service_name, " ".join(replace_cmd), json=True)

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    sdk_tasks.check_tasks_updated(service_name, pod_name, tasks_to_replace)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
Example No. 24
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'
    ][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env'][
        'PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example No. 25
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^node-[0-9]+-server$"))
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(
        set([task.host for task in candidate_tasks])
    ), "Expected candidate tasks to all be on different hosts: {}".format(
        candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len("-server")]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    "pod replace {}".format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info("Checking that the original pod has moved to a new agent:\n"
             "old={}\nnew={}".format(replace_task, new_task))
    assert replace_task.agent_id != new_task.agent_id
Example No. 26
    def restart_zookeeper_node(id: int):
        sdk_cmd.svc_cli(config.ZOOKEEPER_PACKAGE_NAME, config.ZOOKEEPER_SERVICE_NAME,
                        "pod restart zookeeper-{}".format(id))

        sdk_plan.wait_for_kicked_off_recovery(config.ZOOKEEPER_SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.ZOOKEEPER_SERVICE_NAME)