def test_config_update_then_scheduler_died():
    """Bump the world CPU config, kill the scheduler process, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    old_world_ids = sdk_tasks.get_task_ids(service, 'world')
    scheduler_host = sdk_marathon.get_scheduler_host(service)

    config.bump_world_cpus()
    # Take down the scheduler process on the host it was found on before the update.
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    sdk_tasks.check_tasks_updated(service, 'world', old_world_ids)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    """Bump the world CPU config, kill the 'world' process on world-0, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')
    config.bump_world_cpus()

    # Only 1 of the 2 world tasks is killed: the one reachable via world-0-server.
    world_0_host = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('world', world_0_host)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
def test_kill_hello_node():
    """Kill the 'hello' process on hello-0 and check that the hello-0 task ids change."""
    config.check_running()

    pod_prefix = 'hello-0'
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)
    # NOTE(review): this host hardcodes 'hello-world' instead of using config.SERVICE_NAME -- confirm intended.
    sdk_cmd.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, ids_before)

    config.check_running()
示例#4
0
def test_config_update_then_kill_task_in_node():
    """Bump the world CPU config, kill the 'world' process on world-0, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')
    config.bump_world_cpus()

    # Only 1 of the 2 world tasks is killed: the one reachable via world-0-server.
    world_0_host = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('world', world_0_host)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
示例#5
0
def test_kill_hello_node():
    """Kill the 'hello' process on hello-0 and check that the hello-0 task ids change."""
    config.check_running()

    pod_prefix = 'hello-0'
    ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)
    # NOTE(review): this host hardcodes 'hello-world' instead of using config.SERVICE_NAME -- confirm intended.
    sdk_cmd.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, ids_before)

    config.check_running()
示例#6
0
def test_config_update_then_scheduler_died():
    """Bump the world CPU config, kill the scheduler process, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    old_world_ids = sdk_tasks.get_task_ids(service, 'world')
    scheduler_host = sdk_marathon.get_scheduler_host(service)

    config.bump_world_cpus()
    # Take down the scheduler process on the host it was found on before the update.
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    sdk_tasks.check_tasks_updated(service, 'world', old_world_ids)
    config.check_running()
示例#7
0
def test_config_update_then_executor_killed():
    """Bump the world CPU config, kill the executor process on world-0, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')
    config.bump_world_cpus()

    # The executor is matched by its main class name on the world-0 host.
    world_0_host = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', world_0_host)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
def test_kill_hello_task():
    """Kill the hello-0 task's process on its agent and check that the task id changes."""
    matching_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello-0")
    target = matching_tasks[0]

    sdk_cmd.kill_task_with_pattern("hello-container-path/output", "nobody", agent_host=target.host)

    # Only the id of the task that was looked up above is expected to be replaced.
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", [target.id])
    check_healthy()
def test_kill_world_executor():
    """Kill mesos-default-executor on the world-0 task's agent and check that the task id changes."""
    matching_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="world-0")
    target = matching_tasks[0]

    sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=target.host)

    # Only the id of the task that was looked up above is expected to be replaced.
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", [target.id])
    check_healthy()
示例#10
0
def test_kill_data_node():
    """Kill the datanode process on data-0; data tasks should be updated, journal and name tasks should not."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    # Snapshot the task ids of each pod type before the kill.
    old_data_ids = sdk_tasks.get_task_ids(service, 'data-0')
    old_journal_ids = sdk_tasks.get_task_ids(service, 'journal')
    old_name_ids = sdk_tasks.get_task_ids(service, 'name')

    data_0_host = sdk_hosts.system_host(service, 'data-0-node')
    sdk_cmd.kill_task_with_pattern('datanode', data_0_host)
    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'data', old_data_ids)
    sdk_tasks.check_tasks_not_updated(service, 'journal', old_journal_ids)
    sdk_tasks.check_tasks_not_updated(service, 'name', old_name_ids)
示例#11
0
def test_kill_data_node():
    """Kill the datanode process on data-0's agent; data-0 should be replaced, journal and name tasks should not."""
    # Snapshot the data-0 task and the journal/name task ids before the kill.
    victim = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    old_journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    old_name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=victim.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [victim.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", old_journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", old_name_ids)
示例#12
0
def test_kill_data_node():
    """Kill the datanode process on data-0's agent; data-0 should be replaced, journal and name tasks should not."""
    # Snapshot the data-0 task and the journal/name task ids before the kill.
    victim = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    old_journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    old_name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=victim.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [victim.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", old_journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", old_name_ids)
示例#13
0
def test_losing_and_regaining_index_health(default_populated_index: None) -> None:
    """Check index health goes green -> yellow -> green around killing the data-0-node Elasticsearch process."""
    config.check_elasticsearch_index_health(index_name, "green", service_name=service_name)

    data_node_host = sdk_tasks.get_service_tasks(service_name, "data-0-node")[0].host
    sdk_cmd.kill_task_with_pattern("data__.*Elasticsearch", "nobody", agent_host=data_node_host)

    # Health is checked for "yellow" first and then for "green" again.
    for expected_color in ("yellow", "green"):
        config.check_elasticsearch_index_health(index_name, expected_color, service_name=service_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
示例#14
0
def test_kill_all_executors():
    """Kill mesos-default-executor on every service task's agent, then check that all task ids change."""
    service_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    for service_task in service_tasks:
        sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=service_task.host)

    # An empty prefix is passed, together with the ids of every task captured above.
    old_ids = [service_task.id for service_task in service_tasks]
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "", old_ids)
    check_healthy()
示例#15
0
def test_kill_hello_task():
    """Kill the hello-0 task's process on its agent and check that the task id changes."""
    matching_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello-0")
    target = matching_tasks[0]

    sdk_cmd.kill_task_with_pattern("hello-container-path/output", "nobody", agent_host=target.host)

    # Only the id of the task that was looked up above is expected to be replaced.
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", [target.id])
    check_healthy()
示例#16
0
def test_kill_world_executor():
    """Kill mesos-default-executor on the world-0 task's agent and check that the task id changes."""
    matching_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="world-0")
    target = matching_tasks[0]

    sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=target.host)

    # Only the id of the task that was looked up above is expected to be replaced.
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", [target.id])
    check_healthy()
示例#17
0
def test_kill_all_executors():
    """Kill mesos-default-executor on every service task's agent, then check that all task ids change."""
    service_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    for service_task in service_tasks:
        sdk_cmd.kill_task_with_pattern("mesos-default-executor", "nobody", agent_host=service_task.host)

    # An empty prefix is passed, together with the ids of every task captured above.
    old_ids = [service_task.id for service_task in service_tasks]
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "", old_ids)
    check_healthy()
示例#18
0
def test_integrity_on_data_node_failure():
    """
    Verifies proper data replication among data nodes: data written before killing
    two DataNode processes must still be readable afterwards.
    """
    service = config.SERVICE_NAME
    hdfs_filename = get_unique_filename("test_datanode_fail")

    # An HDFS write only returns successfully once the data replication has taken place.
    config.write_data_to_hdfs(service, hdfs_filename)

    # Kill the DataNode process on data-0 and then on data-1.
    for node_name in ('data-0-node', 'data-1-node'):
        sdk_cmd.kill_task_with_pattern("DataNode", sdk_hosts.system_host(service, node_name))

    config.read_data_from_hdfs(service, hdfs_filename)

    config.check_healthy(service_name=service)
示例#19
0
def test_master_reelection():
    """Kill the current Elasticsearch master process and check that a different master node is reported."""
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    master_host = sdk_tasks.get_service_tasks(foldered_name, initial_master)[0].host
    sdk_cmd.kill_task_with_pattern("master__.*Elasticsearch", "nobody", agent_host=master_host)

    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)

    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    # The new master's name must still start with "master" and must differ from the killed one.
    assert new_master.startswith("master")
    assert new_master != initial_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
示例#20
0
def test_master_reelection() -> None:
    """Kill the current Elasticsearch master process and check that a different master node is reported."""
    initial_master = config.get_elasticsearch_master(service_name=service_name)
    master_host = sdk_tasks.get_service_tasks(service_name, initial_master)[0].host
    sdk_cmd.kill_task_with_pattern("master__.*Elasticsearch", "nobody", agent_host=master_host)

    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    config.wait_for_expected_nodes_to_exist(service_name=service_name)

    new_master = config.get_elasticsearch_master(service_name=service_name)
    # The new master's name must still start with "master" and must differ from the killed one.
    assert new_master.startswith("master")
    assert new_master != initial_master

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
示例#21
0
def test_kill_scheduler():
    """Kill the scheduler process and check that its Marathon task id changes."""
    prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    old_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)
    assert len(old_scheduler_ids) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld", "nobody", agent_host=scheduler_host
    )

    # The scheduler is looked up as a task of the "marathon" service.
    sdk_tasks.check_tasks_updated("marathon", prefix, old_scheduler_ids)
    check_healthy()
示例#22
0
def test_kill_scheduler():
    """Kill the scheduler process; its Marathon task id should change while service task ids stay the same."""
    service = config.SERVICE_NAME
    service_task_ids = sdk_tasks.get_task_ids(service, "")
    prefix = sdk_marathon.get_scheduler_task_prefix(service)
    old_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)
    assert len(old_scheduler_ids) == 1, "Expected to find ONLY one scheduler task but found {}".format(old_scheduler_ids)

    scheduler_host = sdk_marathon.get_scheduler_host(service)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld", "nobody", agent_host=scheduler_host
    )

    # The scheduler is looked up as a task of the "marathon" service.
    sdk_tasks.check_tasks_updated("marathon", prefix, old_scheduler_ids)
    sdk_tasks.wait_for_active_framework(service)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(service, "", service_task_ids)
示例#23
0
def test_kill_scheduler():
    """Kill the HDFS scheduler process; its Marathon task id should change while service task ids stay the same."""
    service_task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    old_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)
    assert len(old_scheduler_ids) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(foldered_name)
    sdk_cmd.kill_task_with_pattern("./hdfs-scheduler/bin/hdfs", "nobody", agent_host=scheduler_host)

    # The scheduler should be restarted, but the service tasks should be left as-is.
    sdk_tasks.check_tasks_updated("marathon", prefix, old_scheduler_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", service_task_ids)
    config.check_healthy(service_name=foldered_name)
示例#24
0
def test_config_updates_then_all_executors_killed():
    """Bump the world CPU config, kill the executor on every service host, then check the world tasks were updated."""
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # Plain loop rather than a list comprehension: the kill calls are made only
    # for their side effect, so there is no result list worth building.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
示例#25
0
def test_kill_essential():
    '''Kill the essential task and verify that both hello-0 tasks are relaunched against a matching executor.'''
    verify_shared_executor('hello-0')

    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'hello-0')
    assert len(ids_before) == 2

    # The process pattern is hardcoded in the task cmd, see the service yml.
    essential_host = sdk_hosts.system_host(service, 'hello-0-essential')
    sdk_cmd.kill_task_with_pattern('shared-volume/essential', essential_host)

    # Wait for the ids to change...
    sdk_tasks.check_tasks_updated(service, 'hello-0', ids_before)
    # ...and for the tasks to be up and running.
    sdk_plan.wait_for_completed_recovery(service)

    # The first verify_shared_executor call deleted the files; both should have come back via the relaunch.
    # The files are left as-is for the next test.
    verify_shared_executor('hello-0', delete_files=False)
def test_config_updates_then_all_executors_killed():
    """Bump the world CPU config, kill the executor on every service host, then check the world tasks were updated."""
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # Plain loop rather than a list comprehension: the kill calls are made only
    # for their side effect, so there is no result list worth building.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_kill_all_task_in_node():
    """Bump the world CPU config, kill the 'world' process on every service host, then check the world tasks were updated."""
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # Kill both world tasks. A plain loop is used because the calls are made only
    # for their side effect, so there is no result list worth building.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
示例#28
0
def test_config_update_then_kill_all_task_in_node():
    """Bump the world CPU config, kill the 'world' process on every service host, then check the world tasks were updated."""
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # Kill both world tasks. A plain loop is used because the calls are made only
    # for their side effect, so there is no result list worth building.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_integrity_on_name_node_failure(hdfs_client):
    """
    Kill the active name node and verify that data can still be written and read.

    The first name node (name-0-node) is the active name node by default when HDFS gets
    installed. After killing whichever node is currently active, the test waits for the
    other one to become active, then writes and reads a file to confirm the failover
    sustains expected functionality.
    """
    timeout_ms = config.DEFAULT_HDFS_TIMEOUT * 1000

    @retrying.retry(wait_fixed=1000, stop_max_delay=timeout_ms)
    def _get_active_name_node():
        # Returns the first candidate reported active; raising makes the decorator retry.
        for candidate in ("name-0-node", "name-1-node"):
            if is_name_node_active(candidate):
                return candidate
        raise Exception("Failed to determine active name node")

    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=timeout_ms,
        retry_on_result=lambda res: not res,
    )
    def _wait_for_failover_to_complete(namenode):
        # Retried for as long as the result is falsy.
        return is_name_node_active(namenode)

    active_name_node = _get_active_name_node()
    active_host = sdk_tasks.get_service_tasks(config.SERVICE_NAME, active_name_node)[0].host
    sdk_cmd.kill_task_with_pattern("NameNode", "nobody", agent_host=active_host)

    # After the previously active name node was killed, the opposite one should be marked active.
    new_active_name_node = "name-0-node" if active_name_node == "name-1-node" else "name-1-node"
    _wait_for_failover_to_complete(new_active_name_node)

    hdfs_filename = config.get_unique_filename("test_namenode_fail")
    config.hdfs_client_write_data(hdfs_filename)
    config.hdfs_client_read_data(hdfs_filename)

    config.check_healthy(config.SERVICE_NAME)
def test_integrity_on_data_node_failure(hdfs_client):
    """
    Verifies proper data replication among data nodes: data written before killing
    two DataNode processes must still be readable afterwards.
    """
    hdfs_filename = config.get_unique_filename("test_datanode_fail")

    # An HDFS write only returns successfully once the data replication has taken place.
    config.hdfs_client_write_data(hdfs_filename)

    # There should be 3 data nodes (data-0,1,2); kill the first 2 returned.
    data_node_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "data")
    for position in (0, 1):
        victim_host = data_node_tasks[position].host
        sdk_cmd.kill_task_with_pattern("DataNode", "nobody", agent_host=victim_host)

    config.hdfs_client_read_data(hdfs_filename)

    config.check_healthy(config.SERVICE_NAME)
示例#31
0
def test_kill_scheduler():
    """Kill the HDFS scheduler process; its Marathon task id should change while service task ids stay the same."""
    service_task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    old_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)
    assert len(old_scheduler_ids) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(foldered_name)
    sdk_cmd.kill_task_with_pattern("./hdfs-scheduler/bin/hdfs", "nobody", agent_host=scheduler_host)

    # The scheduler should be restarted, but the service tasks should be left as-is.
    sdk_tasks.check_tasks_updated("marathon", prefix, old_scheduler_ids)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", service_task_ids)
    config.check_healthy(service_name=foldered_name)
示例#32
0
def test_losing_and_regaining_index_health(default_populated_index):
    """Check index health goes green -> yellow -> green around killing the data-0-node Elasticsearch process."""
    index = config.DEFAULT_INDEX_NAME
    config.check_elasticsearch_index_health(index, "green", service_name=foldered_name)

    data_node_host = sdk_tasks.get_service_tasks(foldered_name, "data-0-node")[0].host
    sdk_cmd.kill_task_with_pattern("data__.*Elasticsearch", "nobody", agent_host=data_node_host)

    # Health is checked for "yellow" first and then for "green" again.
    for expected_color in ("yellow", "green"):
        config.check_elasticsearch_index_health(index, expected_color, service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
示例#33
0
def test_kill_nonessential():
    '''Kill the nonessential task and verify that it is relaunched against the same executor as before.'''
    verify_shared_executor('hello-0')

    service = config.SERVICE_NAME
    essential_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-essential')
    assert len(essential_ids_before) == 1
    nonessential_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-nonessential')
    assert len(nonessential_ids_before) == 1

    # The process pattern is hardcoded in the task cmd, see the service yml.
    nonessential_host = sdk_hosts.system_host(service, 'hello-0-nonessential')
    sdk_cmd.kill_task_with_pattern('shared-volume/nonessential', nonessential_host)

    # Only the nonessential task id should change; the essential one should stay put.
    sdk_tasks.check_tasks_updated(service, 'hello-0-nonessential', nonessential_ids_before)
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_not_updated(service, 'hello-0-essential', essential_ids_before)

    # The first verify_shared_executor call deleted the files; only the nonessential file
    # came back via its relaunch.
    verify_shared_executor('hello-0', expected_files=['nonessential'])
def test_kill_agent():
    '''Kill the agent task and verify that it is relaunched against the same executor as before.'''
    verify_shared_executor('hello-0')

    service = config.SERVICE_NAME
    node_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-node')
    assert len(node_ids_before) == 1
    agent_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-agent')
    assert len(agent_ids_before) == 1

    # The process pattern is hardcoded in the task cmd, see the service yml.
    agent_task_host = sdk_hosts.system_host(service, 'hello-0-agent')
    sdk_cmd.kill_task_with_pattern('agent-container-path/output', agent_task_host)

    # The node task id should stay put; only the agent task id should change.
    sdk_tasks.check_tasks_not_updated(service, 'hello-0-node', node_ids_before)
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_updated(service, 'hello-0-agent', agent_ids_before)

    # Re-run the shared executor verification after recovery.
    # NOTE(review): this spot used to describe only a 'nonessential' file coming back, which looks
    # copied from the nonessential test -- confirm what this final check is expected to see.
    verify_shared_executor('hello-0')
示例#35
0
def test_integrity_on_name_node_failure():
    """
    Kill the active name node and verify that data can still be written and read.

    The first name node (name-0-node) is the active name node by default when HDFS gets
    installed. After killing whichever node is currently active, the test waits for the
    other one to take over, then writes and reads a file to confirm the failover
    sustains expected functionality.
    """
    service = config.SERVICE_NAME
    active_name_node = config.get_active_name_node(service)
    active_host = sdk_hosts.system_host(service, active_name_node)
    sdk_cmd.kill_task_with_pattern("NameNode", active_host)

    # Whichever name node was not active is predicted to take over.
    predicted_active_name_node = "name-0-node" if active_name_node == "name-1-node" else "name-1-node"
    wait_for_failover_to_complete(predicted_active_name_node)

    hdfs_filename = get_unique_filename("test_namenode_fail")
    config.write_data_to_hdfs(service, hdfs_filename)
    config.read_data_from_hdfs(service, hdfs_filename)

    config.check_healthy(service_name=service)
def test_kill_agent():
    """Kill the agent task and verify that it is relaunched against the same executor as before."""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2
    node_task = [task for task in pod_tasks if task.name == "hello-0-node"][0]
    agent_task = [task for task in pod_tasks if task.name == "hello-0-agent"][0]

    # The process pattern is hardcoded in the task cmd, see the service yml.
    sdk_cmd.kill_task_with_pattern("agent-container-path/output", "nobody", agent_host=agent_task.host)

    # The node task id should stay put; only the agent task id should change.
    sdk_tasks.check_tasks_not_updated(service, "hello-0-node", [node_task.id])
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_updated(service, "hello-0-agent", [agent_task.id])

    # Re-run the shared executor verification after recovery.
    # NOTE(review): this spot used to describe only a 'nonessential' file coming back, which looks
    # copied from the nonessential test -- confirm what this final check is expected to see.
    verify_shared_executor("hello-0")
def test_kill_agent():
    """Kill the agent task and verify that it is relaunched against the same executor as before."""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2
    node_task = [task for task in pod_tasks if task.name == "hello-0-node"][0]
    agent_task = [task for task in pod_tasks if task.name == "hello-0-agent"][0]

    # The process pattern is hardcoded in the task cmd, see the service yml.
    sdk_cmd.kill_task_with_pattern("agent-container-path/output", "nobody", agent_host=agent_task.host)

    # The node task id should stay put; only the agent task id should change.
    sdk_tasks.check_tasks_not_updated(service, "hello-0-node", [node_task.id])
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_updated(service, "hello-0-agent", [agent_task.id])

    # Re-run the shared executor verification after recovery.
    # NOTE(review): this spot used to describe only a 'nonessential' file coming back, which looks
    # copied from the nonessential test -- confirm what this final check is expected to see.
    verify_shared_executor("hello-0")
示例#38
0
def test_kill_essential():
    """Kill the essential task and verify that both hello-0 tasks are relaunched against a matching executor."""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2

    # Kill the essential task process. Both tasks are in the same pod, so they share a host.
    # The process pattern is hardcoded in the task cmd, see the service yml.
    pod_host = pod_tasks[0].host
    sdk_cmd.kill_task_with_pattern("shared-volume/essential", "nobody", agent_host=pod_host)

    # Wait for both task ids to change...
    ids_before = [task.id for task in pod_tasks]
    sdk_tasks.check_tasks_updated(service, "hello-0", ids_before)
    # ...and for the tasks to be up and running.
    sdk_plan.wait_for_completed_recovery(service)

    # The first verify_shared_executor call deleted the files; both should have come back via the relaunch.
    # The files are left as-is for the next test.
    verify_shared_executor("hello-0", delete_files=False)
示例#39
0
def test_kill_scheduler():
    """Kill the scheduler process; its Marathon task id should change while service task ids stay the same."""
    service = config.SERVICE_NAME
    service_task_ids = sdk_tasks.get_task_ids(service, "")
    prefix = sdk_marathon.get_scheduler_task_prefix(service)
    old_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)
    assert len(old_scheduler_ids) == 1, "Expected to find ONLY one scheduler task but found {}".format(old_scheduler_ids)

    scheduler_host = sdk_marathon.get_scheduler_host(service)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld", "nobody", agent_host=scheduler_host
    )

    # The scheduler is looked up as a task of the "marathon" service.
    sdk_tasks.check_tasks_updated("marathon", prefix, old_scheduler_ids)
    sdk_tasks.wait_for_active_framework(service)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(service, "", service_task_ids)
示例#40
0
def test_supervise_conflict_frameworkid():
    """
    Check that a supervised driver, once killed, re-registers under a different framework id.

    Submits a job with --supervise, waits for it to register and run, kills the
    driver process, then waits for the job to disappear and reappear. The restarted
    service must report a framework id different from the original one. The driver
    is always killed at the end, whatever the outcome.
    """
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        # Retried while the result is falsy, i.e. until the service's presence matches `present`.
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        return svc is None

    job_args = [
        "--supervise", "--class", "MockTaskRunner", "--conf",
        "spark.cores.max=1", "--conf", "spark.executors.cores=1"
    ]

    # Submit before entering the try block. If submission itself fails there is no
    # driver to clean up, and the cleanup in `finally` would otherwise hit an unbound
    # `driver_id` and mask the real error with a NameError.
    driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                 app_args="1 1800",
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=job_args)

    try:
        log.info("Started supervised driver {}".format(driver_id))

        wait_job_present(True)
        log.info("Job has registered")

        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        # The driver process is matched by the framework id it was launched with.
        driver_regex = "spark.mesos.driver.frameworkId={}".format(
            service_info['id'])
        sdk_cmd.kill_task_with_pattern(driver_regex, service_info['hostname'])

        # The job should first go away, then come back.
        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info[
            'id'], "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)
示例#41
0
def test_kill_nonessential():
    """Kill the nonessential task and verify that it is relaunched against the same executor as before."""
    verify_shared_executor("hello-0")

    service = config.SERVICE_NAME
    pod_tasks = sdk_tasks.get_service_tasks(service, "hello-0")
    assert len(pod_tasks) == 2
    essential_task = [task for task in pod_tasks if task.name == "hello-0-essential"][0]
    nonessential_task = [task for task in pod_tasks if task.name == "hello-0-nonessential"][0]

    # Kill the nonessential task process. Both tasks are in the same pod, so they share a host.
    # The process pattern is hardcoded in the task cmd, see the service yml.
    sdk_cmd.kill_task_with_pattern(
        "shared-volume/nonessential", "nobody", agent_host=nonessential_task.host
    )

    # Only the nonessential task id should change; the essential one should stay put.
    sdk_tasks.check_tasks_updated(service, "hello-0-nonessential", [nonessential_task.id])
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_not_updated(service, "hello-0-essential", [essential_task.id])

    # The first verify_shared_executor call deleted the files; only the nonessential file
    # came back via its relaunch.
    verify_shared_executor("hello-0", expected_files=["nonessential"])
示例#42
0
def test_kill_agent():
    '''Kill the agent task and verify that it is relaunched against the same executor as before.'''
    verify_shared_executor('hello-0')

    service = config.SERVICE_NAME
    node_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-node')
    assert len(node_ids_before) == 1
    agent_ids_before = sdk_tasks.get_task_ids(service, 'hello-0-agent')
    assert len(agent_ids_before) == 1

    # The process pattern is hardcoded in the task cmd, see the service yml.
    agent_task_host = sdk_hosts.system_host(service, 'hello-0-agent')
    sdk_cmd.kill_task_with_pattern('agent-container-path/output', agent_task_host)

    # The node task id should stay put; only the agent task id should change.
    sdk_tasks.check_tasks_not_updated(service, 'hello-0-node', node_ids_before)
    sdk_plan.wait_for_completed_recovery(service)
    sdk_tasks.check_tasks_updated(service, 'hello-0-agent', agent_ids_before)

    # Re-run the shared executor verification after recovery.
    # NOTE(review): this spot used to describe only a 'nonessential' file coming back, which looks
    # copied from the nonessential test -- confirm what this final check is expected to see.
    verify_shared_executor('hello-0')
示例#43
0
def test_config_update_then_zk_killed():
    """Bump the hello CPU config, kill processes matching 'zookeeper', then check the hello tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'hello')

    config.bump_hello_cpus()
    # No host is passed here, unlike the per-task kill tests.
    sdk_cmd.kill_task_with_pattern('zookeeper')

    sdk_tasks.check_tasks_updated(service, 'hello', ids_before)
    config.check_running()
示例#44
0
def test_config_update_then_master_killed():
    """Bump the world CPU config, kill processes matching 'mesos-master', then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')

    config.bump_world_cpus()
    # No host is passed here, unlike the per-task kill tests.
    sdk_cmd.kill_task_with_pattern('mesos-master')

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
示例#45
0
def test_zk_killed():
    """Kill processes matching 'zookeeper', then run the service's running check."""
    zk_pattern = 'zookeeper'
    sdk_cmd.kill_task_with_pattern(zk_pattern)

    config.check_running()
def test_all_executors_killed():
    """Kill the executor process on every service host, then run the service's running check."""
    executor_pattern = 'helloworld.executor.Main'
    service_hosts = shakedown.get_service_ips(config.SERVICE_NAME)

    for service_host in service_hosts:
        sdk_cmd.kill_task_with_pattern(executor_pattern, service_host)

    config.check_running()
def test_master_killed():
    """Kill processes matching 'mesos-master', then run the service's running check."""
    master_pattern = 'mesos-master'
    sdk_cmd.kill_task_with_pattern(master_pattern)

    config.check_running()
def test_zk_killed():
    """Kill processes matching 'zookeeper', then run the service's running check."""
    zk_pattern = 'zookeeper'
    sdk_cmd.kill_task_with_pattern(zk_pattern)

    config.check_running()
def test_config_update_then_zk_killed():
    """Bump the hello CPU config, kill processes matching 'zookeeper', then check the hello tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'hello')

    config.bump_hello_cpus()
    # No host is passed here, unlike the per-task kill tests.
    sdk_cmd.kill_task_with_pattern('zookeeper')

    sdk_tasks.check_tasks_updated(service, 'hello', ids_before)
    config.check_running()
def test_config_update_then_master_killed():
    """Bump the world CPU config, kill processes matching 'mesos-master', then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')

    config.bump_world_cpus()
    # No host is passed here, unlike the per-task kill tests.
    sdk_cmd.kill_task_with_pattern('mesos-master')

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
def test_config_update_then_executor_killed():
    """Bump the world CPU config, kill the executor process on world-0, then check the world tasks were updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')
    config.bump_world_cpus()

    # The executor is matched by its main class name on the world-0 host.
    world_0_host = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', world_0_host)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()