Example #1
def test_config_update_while_partitioned():
    world_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "world")
    partition_host = world_tasks[0].host

    sdk_agents.partition_agent(partition_host)

    service_config = sdk_marathon.get_config(config.SERVICE_NAME)
    updated_cpus = float(service_config["env"]["WORLD_CPUS"]) + 0.1
    service_config["env"]["WORLD_CPUS"] = str(updated_cpus)
    sdk_marathon.update_app(service_config,
                            wait_for_completed_deployment=False)

    sdk_agents.reconnect_agent(partition_host)

    # check that ALL the world tasks are updated after the agent reconnects:
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world",
                                  [t.id for t in world_tasks])
    check_healthy()
    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)
    running_tasks = [
        t for t in all_tasks
        if t.name.startswith("world") and t.state == "TASK_RUNNING"
    ]
    assert len(running_tasks) == config.world_task_count(config.SERVICE_NAME)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
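Note: config.close_enough is not defined in this snippet. A minimal sketch, assuming it matches the inline helper shown in Example #25 below:

def close_enough(val0, val1):
    # Epsilon-based float comparison (assumed to mirror config.close_enough).
    epsilon = 0.00001
    return abs(val0 - val1) < epsilon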
Example #2
def get_scheduler_host(service_name):
    task_prefix = get_scheduler_task_prefix(service_name)
    tasks = sdk_tasks.get_service_tasks("marathon", task_prefix=task_prefix)
    if len(tasks) == 0:
        raise Exception(
            "No marathon tasks starting with '{}' were found. Available tasks are: {}".format(
                task_prefix, [task["name"] for task in sdk_tasks.get_service_tasks("marathon")]
            )
        )
    return tasks.pop().host
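A minimal usage sketch for the helper above (the service name is a hypothetical placeholder):

# Hypothetical usage: find the agent host running the scheduler for a service.
scheduler_host = get_scheduler_host("hello-world")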
Example #3
def test_scheduler_task_placement_by_marathon():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    try:
        # This test ensures that the placement of the scheduler task itself works as expected.
        some_private_agent = sdk_agents.get_private_agents().pop()["hostname"]
        logging.info("Constraining scheduler placement to [{}]".format(
            some_private_agent))
        sdk_install.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            expected_running_tasks=1,
            additional_options={
                "service": {
                    "constraints":
                    [["hostname", "CLUSTER", "{}".format(some_private_agent)]],
                    "yaml":
                    "simple",
                }
            },
            wait_for_deployment=False,
        )
        summary = sdk_tasks.get_service_tasks("marathon", config.SERVICE_NAME)
        assert len(summary) == 1, "Expected exactly 1 task matching name [{}], got: [{}]".format(
            config.SERVICE_NAME, summary)
        assert (
            some_private_agent == summary.pop().host
        ), "Scheduler task constraint placement failed by marathon"
    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
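For reference, the placement constraint passed above shown as plain data; the hostname value is a hypothetical placeholder:

# Marathon CLUSTER constraint pinning the scheduler to one private agent:
additional_options = {
    "service": {
        "constraints": [["hostname", "CLUSTER", "10.0.3.15"]],  # hypothetical agent hostname
        "yaml": "simple",
    }
}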
Example #4
def test_unauthorized_users() -> None:
    tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "node-0")[0]
    _, stdout, stderr = sdk_cmd.run_cli(
        "task exec {} bash -c 'export JAVA_HOME=$(ls -d $MESOS_SANDBOX/jdk*/) ;  export PATH=$MESOS_SANDBOX/python-dist/bin:$PATH ; export PATH=$(ls -d $MESOS_SANDBOX/apache-cassandra-*/bin):$PATH ; cqlsh -u dcossuperuser -p wrongpassword -e \"SHOW VERSION\" node-0-server.$FRAMEWORK_HOST $CASSANDRA_NATIVE_TRANSPORT_PORT' ".format(
            tasks.id
        )
    )
    assert "Provided username dcossuperuser and/or password are incorrect" in stderr
Example #5
def get_hello_world_agent_sets():
    hello_agents = []
    world_agents = []
    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        if task.name.startswith("hello-"):
            hello_agents.append(task.agent_id)
        elif task.name.startswith("world-"):
            world_agents.append(task.agent_id)
        else:
            assert False, "Unknown task: " + task.name
    return hello_agents, world_agents
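A usage sketch, assuming the typical intent of comparing placement of the two pod types:

# Hypothetical usage: verify hello and world pods landed on disjoint agents.
hello_agents, world_agents = get_hello_world_agent_sets()
assert set(hello_agents).isdisjoint(world_agents)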
Example #6
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)

    config.expect_recovery(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
Example #8
def test_kill_world_executor():
    world_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="world-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "mesos-default-executor",
        "nobody",
        agent_host=world_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", [world_task.id])
    check_healthy()
Example #9
def test_kill_hello_task():
    hello_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "hello-container-path/output",
        "nobody",
        agent_host=hello_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", [hello_task.id])
    check_healthy()
Example #11
def test_kill_all_executors():
    tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    for task in tasks:
        sdk_cmd.kill_task_with_pattern(
            "mesos-default-executor",
            "nobody",
            agent_host=task.host,
        )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "", [task.id for task in tasks])
    check_healthy()
Example #12
def test_losing_and_regaining_index_health(default_populated_index: None) -> None:
    config.check_elasticsearch_index_health(index_name, "green", service_name=service_name)
    sdk_cmd.kill_task_with_pattern(
        "data__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(service_name, "data-0-node")[0].host,
    )
    config.check_elasticsearch_index_health(index_name, "yellow", service_name=service_name)
    config.check_elasticsearch_index_health(index_name, "green", service_name=service_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
Example #13
def test_kill_all_executors():
    tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    for task in tasks:
        sdk_cmd.kill_task_with_pattern(
            "mesos-default-executor",
            "nobody",
            agent_host=task.host,
        )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "",
                                  [task.id for task in tasks])
    check_healthy()
Example #14
def test_kill_world_executor():
    world_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME,
                                             task_prefix="world-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "mesos-default-executor",
        "nobody",
        agent_host=world_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0",
                                  [world_task.id])
    check_healthy()
Example #15
def test_kill_hello_task():
    hello_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME,
                                             task_prefix="hello-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "hello-container-path/output",
        "nobody",
        agent_host=hello_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0",
                                  [hello_task.id])
    check_healthy()
Example #16
def test_config_update_while_partitioned():
    world_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "world")
    partition_host = world_tasks[0].host

    sdk_agents.partition_agent(partition_host)

    service_config = sdk_marathon.get_config(config.SERVICE_NAME)
    updated_cpus = float(service_config["env"]["WORLD_CPUS"]) + 0.1
    service_config["env"]["WORLD_CPUS"] = str(updated_cpus)
    sdk_marathon.update_app(service_config, wait_for_completed_deployment=False)

    sdk_agents.reconnect_agent(partition_host)

    # check that ALL the world tasks are updated after the agent reconnects:
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world", [t.id for t in world_tasks])
    check_healthy()
    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)
    running_tasks = [
        t for t in all_tasks if t.name.startswith("world") and t.state == "TASK_RUNNING"
    ]
    assert len(running_tasks) == config.world_task_count(config.SERVICE_NAME)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
Example #17
def test_bump_hello_cpus():
    hello_ids = sdk_tasks.get_task_ids(foldered_name, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "hello", hello_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    all_tasks = sdk_tasks.get_service_tasks(foldered_name, task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    assert len(running_tasks) == config.hello_task_count(foldered_name)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
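config.bump_hello_cpus is not shown here. A sketch of what it plausibly does, mirroring the manual WORLD_CPUS bump in Example #1 (an assumption, not the actual helper):

def bump_hello_cpus(service_name):
    # Assumed behavior: raise HELLO_CPUS by 0.1 via a Marathon app update.
    app = sdk_marathon.get_config(service_name)
    updated_cpus = float(app["env"]["HELLO_CPUS"]) + 0.1
    app["env"]["HELLO_CPUS"] = str(updated_cpus)
    sdk_marathon.update_app(app)
    return updated_cpus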
Example #18
def test_bump_world_cpus():
    original_world_ids = sdk_tasks.get_task_ids(foldered_name, "world")
    log.info("world ids: " + str(original_world_ids))

    updated_cpus = config.bump_world_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "world", original_world_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    all_tasks = sdk_tasks.get_service_tasks(foldered_name, task_prefix="world")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    assert len(running_tasks) == config.world_task_count(foldered_name)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
Example #19
def test_master_reelection():
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(foldered_name, initial_master)[0].host,
    )
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #20
def test_master_reelection() -> None:
    initial_master = config.get_elasticsearch_master(service_name=service_name)
    sdk_cmd.kill_task_with_pattern(
        "master__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(service_name, initial_master)[0].host,
    )
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
    config.wait_for_expected_nodes_to_exist(service_name=service_name)
    new_master = config.get_elasticsearch_master(service_name=service_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
Example #21
def test_bump_hello_cpus():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    hello_ids = sdk_tasks.get_task_ids(foldered_name, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "hello", hello_ids)
    config.check_running(foldered_name)

    all_tasks = sdk_tasks.get_service_tasks(foldered_name, task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    assert len(running_tasks) == config.hello_task_count(foldered_name)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
Example #22
def test_integrity_on_data_node_failure(hdfs_client):
    """
    Verifies proper data replication among data nodes.
    """
    test_filename = config.get_unique_filename("test_datanode_fail")

    # An HDFS write will only successfully return when the data replication has taken place
    config.hdfs_client_write_data(test_filename)

    # Should have 3 data nodes (data-0,1,2), kill 2 of them:
    data_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "data")
    for idx in range(2):
        sdk_cmd.kill_task_with_pattern("DataNode", "nobody", agent_host=data_tasks[idx].host)

    config.hdfs_client_read_data(test_filename)

    config.check_healthy(config.SERVICE_NAME)
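config.hdfs_client_write_data is not shown here. A sketch under the assumption that it shells out to the hdfs CLI inside the client task (the task name, binary path, and target directory are assumptions):

def hdfs_client_write_data(filename, content="sample_content"):
    # Pipe the content into an HDFS put; the write only returns successfully
    # once replication has taken place.
    sdk_cmd.service_task_exec(
        config.SERVICE_NAME, "hdfs-client",
        "bash -c 'echo {} | ./hdfs dfs -put - /{}'".format(content, filename))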
Example #23
def test_integrity_on_name_node_failure(hdfs_client):
    """
    The first name node (name-0-node) is the active name node by default when HDFS is installed.
    This test checks that data can still be written and read after the active name node fails,
    verifying that failover preserves expected functionality.
    """

    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=config.DEFAULT_HDFS_TIMEOUT * 1000
    )
    def _get_active_name_node():
        for candidate in ("name-0-node", "name-1-node"):
            if is_name_node_active(candidate):
                return candidate
        raise Exception("Failed to determine active name node")

    active_name_node = _get_active_name_node()
    sdk_cmd.kill_task_with_pattern(
        "NameNode",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(config.SERVICE_NAME, active_name_node)[0].host,
    )

    # After the previous active name node is killed, the other name node should be marked active:
    if active_name_node == "name-1-node":
        new_active_name_node = "name-0-node"
    else:
        new_active_name_node = "name-1-node"

    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=config.DEFAULT_HDFS_TIMEOUT * 1000,
        retry_on_result=lambda res: not res,
    )
    def _wait_for_failover_to_complete(namenode):
        return is_name_node_active(namenode)

    _wait_for_failover_to_complete(new_active_name_node)

    test_filename = config.get_unique_filename("test_namenode_fail")

    config.hdfs_client_write_data(test_filename)
    config.hdfs_client_read_data(test_filename)

    config.check_healthy(config.SERVICE_NAME)
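is_name_node_active is referenced but not defined in this snippet. A sketch assuming it wraps the standard `hdfs haadmin -getServiceState` command run from the HDFS client task (helper shape and paths are assumptions):

def is_name_node_active(namenode):
    # "active" or "standby" is printed for the queried name node ID.
    rc, stdout, _ = sdk_cmd.service_task_exec(
        config.SERVICE_NAME, "hdfs-client",
        "./hdfs haadmin -getServiceState {}".format(namenode))
    return rc == 0 and "active" in stdout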
Example #24
def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name
    )
    sdk_cmd.kill_task_with_pattern(
        "data__.*Elasticsearch",
        "nobody",
        agent_host=sdk_tasks.get_service_tasks(foldered_name, "data-0-node")[0].host,
    )
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "yellow", service_name=foldered_name
    )
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name
    )

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #25
def test_bump_hello_cpus():
    def close_enough(val0, val1):
        epsilon = 0.00001
        diff = abs(val0 - val1)
        return diff < epsilon

    config.check_running(config.SERVICE_NAME)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(config.SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", hello_ids)
    config.check_running(config.SERVICE_NAME)

    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    for t in running_tasks:
        assert close_enough(t.resources["cpus"], updated_cpus)
Example #26
def test_bump_hello_cpus():
    def close_enough(val0, val1):
        epsilon = 0.00001
        diff = abs(val0 - val1)
        return diff < epsilon

    config.check_running(config.SERVICE_NAME)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(config.SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", hello_ids)
    config.check_running(config.SERVICE_NAME)

    all_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME,
                                            task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    for t in running_tasks:
        assert close_enough(t.resources["cpus"], updated_cpus)
Example #27
def test_kill_essential():
    """kill the essential task, verify that both tasks are relaunched against a matching executor"""
    verify_shared_executor("hello-0")

    old_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "hello-0")
    assert len(old_tasks) == 2

    # kill the essential task process. both tasks are on the same pod, so same host:
    sdk_cmd.kill_task_with_pattern(
        "shared-volume/essential",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=old_tasks[0].host,
    )

    # wait for both task ids to change...
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", [t.id for t in old_tasks])
    # ...and for tasks to be up and running
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    # the first verify_shared_executor call deleted the files. both should have come back via the relaunch.
    verify_shared_executor("hello-0", delete_files=False)  # leave files as-is for the next test
Example #28
def test_kill_agent():
    """kill the agent task, verify that the agent task is relaunched against the same executor as before"""
    verify_shared_executor("hello-0")

    old_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "hello-0")
    assert len(old_tasks) == 2
    old_node_task = [t for t in old_tasks if t.name == "hello-0-node"][0]
    old_agent_task = [t for t in old_tasks if t.name == "hello-0-agent"][0]

    sdk_cmd.kill_task_with_pattern(
        "agent-container-path/output",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=old_agent_task.host,
    )

    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "hello-0-node", [old_node_task.id])
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-agent", [old_agent_task.id])

    # the first verify_shared_executor call deleted the files. only the agent task's file came back via its relaunch.
    verify_shared_executor("hello-0")
Example #30
def test_custom_jmx_port():
    expected_open_port = ":7200 (LISTEN)"

    new_config = {"cassandra": {"jmx_port": 7200}}

    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        config.get_foldered_service_name(),
        new_config,
        config.DEFAULT_TASK_COUNT,
    )

    sdk_plan.wait_for_completed_deployment(config.get_foldered_service_name())

    tasks = sdk_tasks.get_service_tasks(config.get_foldered_service_name(),
                                        "node")

    for task in tasks:
        _, stdout, _ = sdk_cmd.run_cli("task exec {} lsof -i :7200".format(
            task.id))
        assert expected_open_port in stdout
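The check above asserts on a fragment of the lsof output; the full matching line would look roughly like this (column layout is an assumption):

# java ... TCP *:7200 (LISTEN)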
Example #31
def test_kill_nonessential():
    """kill the nonessential task, verify that the nonessential task is relaunched against the same executor as before"""
    verify_shared_executor("hello-0")

    old_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "hello-0")
    assert len(old_tasks) == 2
    old_essential_task = [t for t in old_tasks if t.name == "hello-0-essential"][0]
    old_nonessential_task = [t for t in old_tasks if t.name == "hello-0-nonessential"][0]

    # kill the nonessential task process. both tasks are in the same pod, so same host:
    sdk_cmd.kill_task_with_pattern(
        "shared-volume/nonessential",  # hardcoded in cmd, see yml
        "nobody",
        agent_host=old_nonessential_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-nonessential", [old_nonessential_task.id])
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "hello-0-essential", [old_essential_task.id])

    # the first verify_shared_executor call deleted the files. only the nonessential file came back via its relaunch.
    verify_shared_executor("hello-0", expected_files=["nonessential"])
Example #32
def test_scheduler_task_placement_by_marathon():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    try:
        # This test ensures that the placement of the scheduler task itself works as expected.
        some_private_agent = sdk_agents.get_private_agents().pop()["hostname"]
        logging.info("Constraining scheduler placement to [{}]".format(some_private_agent))
        sdk_install.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            expected_running_tasks=1,
            additional_options={
                "service": {
                    "constraints": [["hostname", "CLUSTER", "{}".format(some_private_agent)]],
                    "yaml": "simple"
                }
            },
            wait_for_deployment=False,
        )
        summary = sdk_tasks.get_service_tasks("marathon", config.SERVICE_NAME)
        assert len(summary) == 1, "Expected exactly 1 task matching name [{}], got: [{}]".format(config.SERVICE_NAME, summary)
        assert some_private_agent == summary.pop().host, "Scheduler task constraint placement failed by marathon"
    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
Example #33
def test_unauthorized_users() -> None:
    tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME, "node-0")[0]
    _, stdout, stderr = sdk_cmd.run_cli("task exec {} bash -c 'export JAVA_HOME=$(ls -d $MESOS_SANDBOX/jdk*/jre/) ;  export PATH=$MESOS_SANDBOX/python-dist/bin:$PATH ; export PATH=$(ls -d $MESOS_SANDBOX/apache-cassandra-*/bin):$PATH ; cqlsh -u dcossuperuser -p wrongpassword -e \"SHOW VERSION\" node-0-server.$FRAMEWORK_HOST $CASSANDRA_NATIVE_TRANSPORT_PORT' ".format(tasks.id))
    assert "Provided username dcossuperuser and/or password are incorrect" in stderr
Example #34
def test_xmx_and_xms_flags(configure_security):
    """Verify that the JVM heap flags (Xms/Xmx) are set exactly once in each Elastic task."""

    # setting custom values for the heap of various pods
    MASTER_NODE_HEAP = 700
    DATA_NODE_HEAP = 800
    COORDINATOR_NODE_HEAP = 900
    INGEST_NODE_HEAP = 1000

    # Install the Elastic service, passing customized JSON to override the default heap values.
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        config.DEFAULT_TASK_COUNT,
        {
            "master_nodes": {"heap": {"size": MASTER_NODE_HEAP}},
            "data_nodes": {"heap": {"size": DATA_NODE_HEAP}},
            "coordinator_nodes": {"heap": {"size": COORDINATOR_NODE_HEAP}},
            "ingest_nodes": {"heap": {"size": INGEST_NODE_HEAP}},
        },
    )
    # Get all the tasks and check for duplicated heap flags in the "ps aux" output of each.
    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        cmd = "ps aux"
        flag_xms = "Xms"
        flag_xmx = "Xmx"
        exit_code, stdout, stderr = sdk_cmd.service_task_exec(config.SERVICE_NAME, task.name, cmd)

        if not str(task).count("exporter"):
            assert str(stdout).count(flag_xms) == 1, "Default xms flag prefix should appear once"
            assert str(stdout).count(flag_xmx) == 1, "Default xmx flag prefix should appear once"

        if str(task).count("master"):
            master_xms = flag_xms + str(MASTER_NODE_HEAP)
            master_xmx = flag_xmx + str(MASTER_NODE_HEAP)
            log.info("Checking flags in master node: " + task.name)
            assert (
                str(stdout).count(master_xms) == 1
            ), "Configured master node xms flag prefix should appear once"
            assert (
                str(stdout).count(master_xmx) == 1
            ), "Configured master node xmx flag prefix should appear once"

        if str(task).count("data"):
            data_xms = flag_xms + str(DATA_NODE_HEAP)
            data_xmx = flag_xmx + str(DATA_NODE_HEAP)
            log.info("Checking flags in data node: " + task.name)
            assert (
                str(stdout).count(data_xms) == 1
            ), "Configured data node xms flag prefix should appear once"
            assert (
                str(stdout).count(data_xmx) == 1
            ), "Configured data node xmx flag prefix should appear once"

        if str(task).count("coordinator"):
            coordinator_xms = flag_xms + str(COORDINATOR_NODE_HEAP)
            coordinator_xmx = flag_xmx + str(COORDINATOR_NODE_HEAP)
            log.info("Checking flags in coordinator node: " + task.name)
            assert (
                str(stdout).count(coordinator_xms) == 1
            ), "Configured coordinator node xms flag prefix should appear once"
            assert (
                str(stdout).count(coordinator_xmx) == 1
            ), "Configured coordinator node xmx flag prefix should appear once"

        if str(task).count("ingest"):
            ingest_xms = flag_xms + str(INGEST_NODE_HEAP)
            ingest_xmx = flag_xmx + str(INGEST_NODE_HEAP)
            log.info("Checking flags in ingest node: " + task.name)
            assert (
                str(stdout).count(ingest_xms) == 1
            ), "Configured ingest node flag xms prefix should appear once"
            assert (
                str(stdout).count(ingest_xmx) == 1
            ), "Configured ingest node flag xmx prefix should appear once"
Example #35
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec."""

    deployment_plan = sdk_plan.wait_for_completed_deployment(
        config.SERVICE_NAME)
    log.info(sdk_plan.plan_string("deploy", deployment_plan))

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    expected_running_tasks = [
        "overlay-vip-0-server",
        "overlay-0-server",
        "host-vip-0-server",
        "host-0-server",
    ]
    assert set(expected_running_tasks) == set([t.name for t in framework_tasks])

    for task in framework_tasks:
        name = task.name
        if name.startswith("host-"):
            assert "ports" in task.resources.keys(), "Task {} should have port resources".format(
                name
            )
            sdk_networks.check_task_network(name, expected_network_name=None)
        elif name.startswith("overlay-"):
            assert (
                "ports" not in task.resources.keys()
            ), "Task {} should NOT have port resources".format(name)
            sdk_networks.check_task_network(name)
        else:
            assert False, "Unknown task {}".format(name)

    endpoints_result = sdk_networks.get_endpoint_names(config.PACKAGE_NAME, config.SERVICE_NAME)
    assert len(endpoints_result) == 2, "Expected 2 endpoints, got: {}".format(endpoints_result)

    overlay_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "overlay-vip"
    )
    assert "address" in overlay_endpoints_result.keys(), (
        "overlay endpoints missing 'address': {}".format(overlay_endpoints_result)
    )
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "overlay-vip-0-server", 4044)

    host_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "host-vip"
    )
    assert "address" in host_endpoints_result.keys(), (
        "host endpoints missing 'address': {}".format(host_endpoints_result)
    )
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "host-vip-0-server", 4044)
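sdk_hosts.autoip_host builds the DC/OS autoip DNS name for a task; for the host VIP above, the expected value has roughly this form (exact domain layout assumed):

# host-vip-0-server.<service-name>.autoip.dcos.thisdcos.directory:4044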
Example #36
def check_health_check_logs():
    broker = None
    executor_path = None
    health_check = {"success": 0, "failure": 0}

    # Get list of all tasks for kafka service
    tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    # Find broker 0 (kafka-0-broker) among the running tasks
    for t in tasks:
        if t.state == "TASK_RUNNING" and t.name == "kafka-0-broker":
            broker = t
            break
    if not broker:
        raise Exception("Broker 0 is not in status: TASK_RUNNING!")

    # Get Mesos Executor path for Broker 0
    cluster_tasks = sdk_cmd.cluster_request("GET", "/mesos/tasks").json()
    agent_executor_paths = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/debug".format(broker.agent_id)
    ).json()
    for cluster_task in cluster_tasks["tasks"]:
        if cluster_task["slave_id"] == broker.agent_id:
            executor_path = sdk_diag._find_matching_executor_path(
                agent_executor_paths, sdk_diag._TaskEntry(cluster_task)
            )
            break
    if not executor_path:
        raise Exception("Executor path not found!")

    # Get path of executor's stderr file
    file_infos = sdk_cmd.cluster_request(
        "GET", "/slave/{}/files/browse?path={}".format(broker.agent_id, executor_path)
    ).json()
    file_info = None
    for _t in file_infos:
        if _t["path"].endswith("/stderr"):
            file_info = _t
            break
    if not file_info:
        raise Exception("Executor stderr file not found!")

    # Download stderr file
    stderr_log = None
    try:
        stderr_log = tempfile.NamedTemporaryFile(mode="wb", delete=False)
        stream = sdk_cmd.cluster_request(
            "GET",
            "/slave/{}/files/download?path={}".format(broker.agent_id, file_info["path"])
        )
        with stderr_log as f:
            for chunk in stream.iter_content(chunk_size=8192):
                f.write(chunk)
    except Exception:
        log.exception(
            "Failed to get file: {} from agent: {}".format(file_info["path"], broker.agent_id))

    # Check stderr for health check output
    try:
        with open(stderr_log.name) as f:
            for line in f:
                if re.match("^(Health check passed)", line):
                    health_check["success"] += 1
                if re.match("^(Health check failed)", line):
                    health_check["faliure"] += 1
    except Exception:
        log.exception("Failed to read downloaded executor stderr file: {} from agent: {}".format(file_info["path"], broker.agent_id))

    log.info("HEALTH CHECK success:{} faliure:{}".format(health_check["success"], health_check["faliure"]))
    assert health_check["success"] > health_check["faliure"]
Example #37
def get_metrics(package_name: str, service_name: str, pod_name: str, task_name: str) -> List:
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    pod_name -- the name of the pod containing the task
    task_name -- the name of the task whose agent to run metrics commands from
    """

    # Find task entry in mesos state (task_to_check stays None if no task matches):
    task_to_check = None
    tasks = sdk_tasks.get_service_tasks(service_name)
    for task in tasks:
        if task.name == task_name:
            task_to_check = task
            break
    if task_to_check is None:
        raise Exception(
            "Task named {} not found in service {}: {}".format(task_name, service_name, tasks)
        )

    # Find task's container id via recent TaskStatus:
    rc, stdout, _ = sdk_cmd.svc_cli(
        package_name, service_name, "pod info {}".format(pod_name), print_output=False
    )
    assert rc == 0, "Pod info failed"
    pod_info = json.loads(stdout)
    task_container_id = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_container_id = task["status"]["containerStatus"]["containerId"]["value"]
            break
    if task_container_id is None:
        log.warning("Task named {} not found in pod {}: {}".format(task_name, pod_name, pod_info))
        return []

    # Not related to functionality, but consuming this endpoint to verify metrics integrity
    containers_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers".format(task_to_check.agent_id),
        retry=False,
    )
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break
    if not container_id_reported:
        raise ValueError(
            "The metrics /container endpoint returned {} for agent {}, expected {} to be returned as well".format(
                reported_container_ids, task_to_check.agent_id, task_container_id
            )
        )

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            task_to_check.agent_id, task_container_id
        ),
        retry=False,
    )
    app_response.raise_for_status()
    app_json = app_response.json()

    if "dimensions" not in app_json:
        log.error("Expected key '%s' not found in app metrics: %s", "dimensions", app_json)
        raise Exception("Expected key 'dimensions' not found in app metrics")

    if "task_name" not in app_json["dimensions"]:
        log.error(
            "Expected key '%s' not found in app metrics: %s", "dimensions.task_name", app_json
        )
        raise Exception("Expected key 'dimensions.task_name' not found in app metrics")

    if app_json["dimensions"]["task_name"] == task_name:
        return list(app_json["datapoints"])

    raise Exception("No metrics found for task {} in service {}".format(task_name, service_name))
Example #38
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec."""

    deployment_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info(sdk_plan.plan_string("deploy", deployment_plan))

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = sdk_tasks.get_service_tasks(config.SERVICE_NAME)

    expected_running_tasks = [
        "overlay-vip-0-server",
        "overlay-0-server",
        "host-vip-0-server",
        "host-0-server"
    ]
    assert set(expected_running_tasks) == set([t.name for t in framework_tasks])

    for task in framework_tasks:
        name = task.name
        if name.startswith("host-"):
            assert "ports" in task.resources.keys(), "Task {} should have port resources".format(
                name
            )
            sdk_networks.check_task_network(name, expected_network_name=None)
        elif name.startswith("overlay-"):
            assert (
                "ports" not in task.resources.keys()
            ), "Task {} should NOT have port resources".format(
                name
            )
            sdk_networks.check_task_network(name)
        else:
            assert False, "Unknown task {}".format(name)

    endpoints_result = sdk_networks.get_endpoint_names(config.PACKAGE_NAME, config.SERVICE_NAME)
    assert len(endpoints_result) == 2, "Expected 2 endpoints, got: {}".format(endpoints_result)

    overlay_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "overlay-vip"
    )
    assert "address" in overlay_endpoints_result.keys(), (
        "overlay endpoints missing 'address': {}".format(overlay_endpoints_result)
    )
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "overlay-vip-0-server", 4044
    )

    host_endpoints_result = sdk_networks.get_endpoint(
        config.PACKAGE_NAME, config.SERVICE_NAME, "host-vip"
    )
    assert "address" in host_endpoints_result.keys(), (
        "overlay endpoints missing 'address'" "{}".format(host_endpoints_result)
    )
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "host-vip-0-server", 4044
    )