Example #1
def test_master_node_replace() -> None:
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(package_name, service_name, "pod replace master-0")
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def client_can_read_and_write(test_id: str,
                              kafka_client: dict, kafka_server: dict,
                              endpoint_name: str, krb5: object = None) -> tuple:
    client_id = kafka_client["id"]

    brokers_list = service_get_brokers(kafka_server, endpoint_name)
    # materialize the hosts so they can be resolved AND logged meaningfully
    # (a bare map object would log as "<map object ...>")
    broker_hosts = list(map(lambda b: b.split(":")[0], brokers_list))
    brokers = ",".join(brokers_list)

    if not sdk_cmd.resolve_hosts(kafka_client["id"], broker_hosts):
        log.error("Failed to resolve brokers: %s", broker_hosts)
        return False, []

    topic_name = kafka_client["env"]["KAFKA_TOPIC"]
    sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                    "topic create {}".format(topic_name),
                    json=True)

    test_utils.wait_for_topic(kafka_server["package_name"], kafka_server["service"]["name"], topic_name)

    message = str(uuid.uuid4())

    security_options = {"is-tls": endpoint_name == "broker-tls",
                        "kerberos": krb5}

    write_success = write_to_topic(test_id, client_id, topic_name, message, brokers, security_options)
    if write_success:
        MESSAGES.append(message)

    read_messages = read_from_topic(test_id, client_id, topic_name, len(MESSAGES), brokers, security_options)

    # use a list, not a lazy map object, so callers can inspect the per-message results
    read_success = [m in read_messages for m in MESSAGES]

    return write_success, read_success
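A minimal usage sketch for the helper above (the fixture names and the "broker" endpoint name are assumptions, not taken from the surrounding tests):

def test_client_can_read_and_write(kafka_client: dict, kafka_server: dict) -> None:
    # Hypothetical test: write one message and confirm all previously
    # written messages can be read back over the plaintext broker endpoint.
    write_success, read_successes = client_can_read_and_write(
        "smoke", kafka_client, kafka_server, "broker")
    assert write_success, "Write failed"
    assert all(read_successes), "Some previously written messages were not read back"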
def test_zones_referenced_in_placement_constraints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
    sdk_install.install(
        config.PACKAGE_NAME,
        foldered_name,
        config.DEFAULT_BROKER_COUNT,
        additional_options={
            "service": {
                "name": foldered_name,
                "placement_constraint": "[[\"@zone\", \"GROUP_BY\"]]"
            }
        })

    test_utils.broker_count_check(
        config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    broker_ids = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, 'broker list', json=True)

    for broker_id in broker_ids:
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            foldered_name,
            'broker get {}'.format(broker_id),
            json=True)

        assert sdk_fault_domain.is_valid_zone(broker_info.get('rack'))

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
def test_canary_third():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "plan continue deploy hello-deploy")

    expected_tasks = ["hello-0", "hello-1", "hello-2", "hello-3", "world-0"]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod list")
    assert rc == 0, "Pod list failed"
    assert json.loads(stdout) == expected_tasks

    pl = sdk_plan.wait_for_completed_phase(config.SERVICE_NAME, "deploy", "hello-deploy")
    log.info(pl)

    assert pl["status"] == "WAITING"

    assert len(pl["phases"]) == 2

    phase = pl["phases"][0]
    assert phase["status"] == "COMPLETE"
    steps = phase["steps"]
    assert len(steps) == 4
    assert steps[0]["status"] == "COMPLETE"
    assert steps[1]["status"] == "COMPLETE"
    assert steps[2]["status"] == "COMPLETE"
    assert steps[3]["status"] == "COMPLETE"

    phase = pl["phases"][1]
    assert phase["status"] == "WAITING"
    steps = phase["steps"]
    assert len(steps) == 4
    assert steps[0]["status"] == "COMPLETE"
    assert steps[1]["status"] == "WAITING"
    assert steps[2]["status"] == "PENDING"
    assert steps[3]["status"] == "PENDING"
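The repeated per-step assertions above could be collapsed into a small helper; a sketch, assuming `pl` keeps the plan JSON shape shown in this test:

def assert_step_statuses(plan: dict, phase_index: int, expected: list) -> None:
    # Compare the statuses of all steps in one phase against an expected list.
    steps = plan["phases"][phase_index]["steps"]
    assert [step["status"] for step in steps] == expected

# e.g. for the second phase above:
# assert_step_statuses(pl, 1, ["COMPLETE", "WAITING", "PENDING", "PENDING"])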
Example #5
def test_topic_offsets_increase_with_writes():
    offset_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic offsets --time="-1" {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(offset_info) == config.DEFAULT_PARTITION_COUNT

    offsets = {}
    for o in offset_info:
        assert len(o) == config.DEFAULT_REPLICATION_FACTOR
        offsets.update(o)

    assert len(offsets) == config.DEFAULT_PARTITION_COUNT

    num_messages = 10
    write_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic producer_test {} {}'.format(config.DEFAULT_TOPIC_NAME, num_messages), json=True)
    assert len(write_info) == 1
    assert write_info['message'].startswith(
        'Output: {} records sent'.format(num_messages))

    offset_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic offsets --time="-1" {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(offset_info) == config.DEFAULT_PARTITION_COUNT

    post_write_offsets = {}
    for o in offset_info:
        assert len(o) == config.DEFAULT_REPLICATION_FACTOR
        post_write_offsets.update(o)

    assert offsets != post_write_offsets
Example #6
def test_plan_cli():
    plan_name = "deploy"
    phase_name = "world"
    _check_json_output(foldered_name, "plan list")
    rc, _, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "plan show {}".format(plan_name))
    assert rc == 0
    _check_json_output(foldered_name, "plan show --json {}".format(plan_name))
    _check_json_output(foldered_name, "plan show {} --json".format(plan_name))

    # trigger a restart so that the plan is in a non-complete state.
    # the 'interrupt' command will fail if the plan is already complete:
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan force-restart {}".format(plan_name)
    )
    assert rc == 0
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan interrupt {} {}".format(plan_name, phase_name)
    )
    assert rc == 0
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan continue {} {}".format(plan_name, phase_name)
    )
    assert rc == 0

    # now wait for plan to finish before continuing to other tests:
    assert sdk_plan.wait_for_completed_plan(foldered_name, plan_name)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Example #8
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
            "world": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
        }
    )

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        get_num_private_agents() * 2,
        additional_options=options,
    )

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    sdk_tasks.check_running(
        config.SERVICE_NAME, get_num_private_agents() * 2 - 1, timeout_seconds=10
    )
    sdk_tasks.check_running(config.SERVICE_NAME, get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
Example #9
def test_secrets_basic():
    # 1) create Secrets
    # 2) install examples/secrets.yml
    # 3) if secret file is not created, tasks will fail
    # 4) wait till deployment finishes
    # 5) do replace operation
    # 6) ensure all tasks are running
    # 7) delete Secrets

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD, additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "word-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace world-0")

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0-server", world_tasks_0)

    # tasks will fail if secret files are not created by mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
Example #10
def test_port_static_to_dynamic_port():
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    for broker_id in range(config.DEFAULT_BROKER_COUNT):
        result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'broker get {}'.format(broker_id), json=True)
        assert result['port'] != 9092

    result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    assert len(result['address']) == config.DEFAULT_BROKER_COUNT
    assert len(result['dns']) == config.DEFAULT_BROKER_COUNT

    for port in result['address']:
        assert int(port.split(':')[-1]) != 9092

    for port in result['dns']:
        assert int(port.split(':')[-1]) != 9092
Example #11
def test_endpoints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    # check that we can reach the scheduler via admin router, and that returned endpoints are sanitized:
    core_site = etree.fromstring(sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints core-site.xml'))
    check_properties(core_site, {
        'ha.zookeeper.parent-znode': '/{}/hadoop-ha'.format(sdk_utils.get_zk_path(
            foldered_name))
    })

    hdfs_site = etree.fromstring(sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints hdfs-site.xml'))
    expect = {
        'dfs.namenode.shared.edits.dir': 'qjournal://{}/hdfs'.format(';'.join([
            sdk_hosts.autoip_host(
                foldered_name,
                'journal-{}-node'.format(i),
                8485
            ) for i in range(3)])),
    }
    for i in range(2):
        name_node = 'name-{}-node'.format(i)
        expect['dfs.namenode.rpc-address.hdfs.{}'.format(name_node)] = sdk_hosts.autoip_host(
            foldered_name, name_node, 9001)
        expect['dfs.namenode.http-address.hdfs.{}'.format(name_node)] = sdk_hosts.autoip_host(
            foldered_name, name_node, 9002)
    check_properties(hdfs_site, expect)
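check_properties is referenced but not shown here; a minimal sketch, assuming it verifies name/value pairs in the Hadoop-style configuration XML parsed with etree above:

def check_properties(xml_root, expected: dict) -> None:
    # Hypothetical helper: collect <property><name>/<value> pairs from a
    # Hadoop configuration XML document and check the expected subset.
    found = {
        prop.find("name").text: prop.find("value").text
        for prop in xml_root.findall("property")
    }
    for name, value in expected.items():
        assert found.get(name) == value, "{}: expected {}, got {}".format(
            name, value, found.get(name))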
Example #12
def test_custom_zookeeper():
    broker_ids = sdk_tasks.get_task_ids(
        FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    # create a topic against the default zk:
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic list', json=True) == [config.DEFAULT_TOPIC_NAME]

    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    # should be using default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(ZK_SERVICE_PATH)
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(
        FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME)

    # wait for brokers to finish registering
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=FOLDERED_SERVICE_NAME)

    zookeeper = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # topic created earlier against default zk should no longer be present:
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic list', json=True) == []
Example #13
def test_config_cli():
    configs = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config list', json=True)
    assert len(configs) >= 1  # refrain from breaking this test if earlier tests did a config update

    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config show {}'.format(configs[0]), print_output=False) # noisy output
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config target', json=True)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config target_id', json=True)
Example #14
def test_canary_first():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    expected_tasks = ['hello-0']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    # do not always rely on service_plan here;
    # at this point, the plan endpoint should always return properly
    pl = sdk_plan.wait_for_completed_step(config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]')
    log.info(pl)

    assert pl['status'] == 'WAITING'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
Example #15
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
Example #16
def test_canary_fourth():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy world-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    pl = sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'deploy')
    log.info(pl)

    assert pl['status'] == 'COMPLETE'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
Example #17
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Example #18
def test_custom_zookeeper():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    broker_ids = sdk_tasks.get_task_ids(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE))

    # create a topic against the default zk:
    test_utils.create_topic(config.DEFAULT_TOPIC_NAME, service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    # should be using default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(sdk_utils.get_zk_path(foldered_name))
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_updated(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # wait for brokers to finish registering
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # topic created earlier against default zk should no longer be present:
    topic_list_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'topic list', json=True)

    test_utils.assert_topic_lists_are_equal_without_automatic_topics([], topic_list_info)
def test_zones_not_referenced_in_placement_constraints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
    sdk_install.install(
        config.PACKAGE_NAME,
        foldered_name,
        config.DEFAULT_BROKER_COUNT,
        additional_options={
            "service": {
                "name": foldered_name
            }
        })

    test_utils.broker_count_check(
        config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    broker_ids = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, 'broker list', json=True)

    for broker_id in broker_ids:
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            foldered_name,
            'broker get {}'.format(broker_id),
            json=True)

        assert broker_info.get('rack') is None

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
Example #20
def test_pod_restart():
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")

    # get current agent id:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Pod info failed"
    old_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]

    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart hello-0"
    )
    assert rc == 0, "Pod restart failed"
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == "hello-0"
    assert len(jsonobj["tasks"]) == 1
    assert jsonobj["tasks"][0] == "hello-0-server"

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", hello_ids)
    check_healthy()

    # check agent didn't move:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Second pod info failed"
    new_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]
    assert old_agent == new_agent
Example #21
def check_cache_refresh_fails_409conflict():
    try:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
    except Exception as e:
        if "failed: 409 Conflict" in e.args[0]:
            return True
    return False
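A predicate like this is normally polled until it returns True; a sketch using the retrying decorator that appears elsewhere in these examples (the wait and timeout values are illustrative):

@retrying.retry(wait_fixed=1000,
                stop_max_delay=60 * 1000,
                retry_on_result=lambda result: not result)
def wait_for_cache_refresh_conflict():
    # Retried (up to the illustrative 60s) until the 409 Conflict is observed.
    return check_cache_refresh_fails_409conflict()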
Example #22
def test_topic_partition_count():
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    topic_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic describe {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(topic_info['partitions']) == config.DEFAULT_PARTITION_COUNT
Example #23
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify it moves hosts
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello', old_ids)

    assert get_task_host('hello-0-server') == other_agent
Example #24
def delete_topic(service_name=config.SERVICE_NAME):
    delete_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'topic delete {}'.format(EPHEMERAL_TOPIC_NAME), json=True)
    assert len(delete_info) == 1
    assert delete_info['message'].startswith('Output: Topic {} is marked for deletion'.format(EPHEMERAL_TOPIC_NAME))

    topic_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'topic describe {}'.format(EPHEMERAL_TOPIC_NAME), json=True)
    assert len(topic_info) == 1
    assert len(topic_info['partitions']) == config.DEFAULT_PARTITION_COUNT
Example #25
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three node Cassandra ring)
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
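Two usage sketches based on the docstring above (package names, service names, and the timeout are illustrative assumptions):

# Kafka: only the replaced pod's own task should restart.
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)

# Cassandra seed node: replacing node-0 also rolls the other nodes.
check_permanent_recovery(
    "cassandra", "cassandra", "node-0", recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"])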
def test_authz_acls_required(kafka_client, kafka_server, kerberos):
    client_id = kafka_client["id"]

    sdk_cmd.resolve_hosts(kafka_client["id"], kafka_client["brokers"])

    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                    "topic create {}".format(topic_name),
                    json=True)

    test_utils.wait_for_topic(kafka_server["package_name"], kafka_server["service"]["name"], topic_name)

    message = str(uuid.uuid4())

    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("authorized", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(read_from_topic("authorized", client_id, topic_name, 1, kerberos))

    log.info("Writing and reading: Reading from the topic, as super user")
    assert message in read_from_topic("super", client_id, topic_name, 1, kerberos)

    zookeeper_endpoint = sdk_cmd.svc_cli(
        kafka_server["package_name"],
        kafka_server["service"]["name"],
        "endpoint zookeeper").strip()

    # TODO: If zookeeper has Kerberos enabled, then the environment should be changed
    topics.add_acls("authorized", client_id, topic_name, zookeeper_endpoint, env_str=None)

    # Send a second message, which the authorized user should now be allowed to write
    second_message = str(uuid.uuid4())
    log.info("Writing and reading: Writing to the topic, but not super user")
    assert write_to_topic("authorized", client_id, topic_name, second_message, kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, second_message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    topic_output = read_from_topic("authorized", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output

    log.info("Writing and reading: Reading from the topic, as super user")
    topic_output = read_from_topic("super", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output

    # Check that the unauthorized client can still not read or write from the topic.
    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("unauthorized", client_id, topic_name, second_message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(read_from_topic("unauthorized", client_id, topic_name, 1, kerberos))
Example #27
def test_service_startup_rapid():
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == [ '{}-broker'.format(task_short_name) ]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), [ broker_task_id_0 ])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)

    if started_time is None or starting_time is None:
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
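log_line_ts is referenced but not shown; a minimal sketch, assuming the standard log4j prefix that Kafka broker logs use (e.g. "[2017-06-14 22:20:06,156] INFO ... starting (kafka.server.KafkaServer)"):

def log_line_ts(log_line):
    # Hypothetical helper: parse the leading "[YYYY-MM-DD HH:MM:SS,mmm]"
    # timestamp from a Kafka broker log line into a datetime.
    ts = log_line.split(']')[0].lstrip('[')
    return datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S,%f')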
Example #28
def replace_broker_pod(service_name=config.SERVICE_NAME):
    pod_name = '{}-0'.format(config.DEFAULT_POD_TYPE)
    task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
    broker_0_id = sdk_tasks.get_task_ids(service_name, task_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'pod replace {}'.format(pod_name))
    sdk_tasks.check_tasks_updated(service_name, task_name, broker_0_id)
    sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
    # wait till all brokers register
    broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=service_name)
Example #29
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify it moves hosts
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    assert get_task_host("hello-0-server") == other_agent
Example #30
def test_config_cli():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    configs = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config list', json=True)
    assert len(configs) >= 1  # refrain from breaking this test if earlier tests did a config update

    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
        'debug config show {}'.format(configs[0]), print_output=False) # noisy output
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config target', json=True)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config target_id', json=True)
Example #31
def describe(topic):
    sdk_cmd.svc_cli(package_name,
                    service_name,
                    "topic describe {}".format(topic),
                    json=True)
Example #32
def test_no_unavailable_partitions_exist():
    partition_info = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                     sdk_utils.get_foldered_name(
                                         config.SERVICE_NAME),
                                     'topic unavailable_partitions',
                                     json=True)
    # the list of unavailable partitions should be empty
    assert len(partition_info) == 0
Example #33
def kill_driver(driver_id, service_name=SPARK_SERVICE_NAME):
    return sdk_cmd.svc_cli(SPARK_PACKAGE_NAME, service_name,
                           "kill {}".format(driver_id))
Example #34
def get_metrics(package_name, service_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    tasks = shakedown.get_service_tasks(service_name)
    # initialize so the check below doesn't hit an unbound variable when no task matches
    task_to_check = None
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task
            break

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name,
                               service_name,
                               "pod info {}".format(pod_name),
                               json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break

    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"][
        "value"]

    # Not related to functionality but consuming this
    # endpoint to verify downstream integrity
    containers_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers".format(agent_id),
        retry=False)
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True

    if not container_id_reported:
        raise ValueError(
            "The metrics /container endpoint returned {}, expecting {} to be returned as well"
            .format(reported_container_ids, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            agent_id, task_container_id),
        retry=False)
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
Example #35
def test_topic_offsets_increase_with_writes(kafka_server: dict):
    package_name = kafka_server["package_name"]
    service_name = kafka_server["service"]["name"]

    def offset_is_valid(result) -> bool:
        initial = result[0]
        offsets = result[1]

        LOG.info("Checking validity with initial=%s offsets=%s", initial,
                 offsets)
        has_elements = bool(
            topics.filter_empty_offsets(offsets, additional=initial))
        # The return of this function triggers the restart.
        return not has_elements

    @retrying.retry(wait_exponential_multiplier=1000,
                    wait_exponential_max=60 * 1000,
                    retry_on_result=offset_is_valid)
    def get_offset_change(topic_name, initial_offsets=[]):
        """
        Run:
            `dcos kafa topic offsets --time="-1"`
        until the output is not the initial output specified
        """
        LOG.info("Getting offsets for %s", topic_name)
        offsets = sdk_cmd.svc_cli(
            package_name,
            service_name,
            'topic offsets --time="-1" {}'.format(topic_name),
            json=True)
        LOG.info("offsets=%s", offsets)
        return initial_offsets, offsets

    topic_name = str(uuid.uuid4())
    LOG.info("Creating topic: %s", topic_name)
    test_utils.create_topic(topic_name, service_name)

    _, offset_info = get_offset_change(topic_name)

    # offset_info is a list of (partition index, offset) key-value pairs;
    # sum the integer representations of the offsets
    initial_offset = sum(
        map(lambda partition: sum(map(int, partition.values())), offset_info))
    LOG.info("Initial offset=%s", initial_offset)

    num_messages = 10
    LOG.info("Sending %s messages", num_messages)
    write_info = sdk_cmd.svc_cli(package_name,
                                 service_name,
                                 'topic producer_test {} {}'.format(
                                     topic_name, num_messages),
                                 json=True)
    assert len(write_info) == 1
    assert write_info['message'].startswith(
        'Output: {} records sent'.format(num_messages))

    _, post_write_offset_info = get_offset_change(topic_name, offset_info)

    post_write_offset = sum(
        map(lambda partition: sum(map(int, partition.values())),
            post_write_offset_info))
    LOG.info("Post-write offset=%s", post_write_offset)

    assert post_write_offset > initial_offset
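For clarity, a tiny worked example of the offset-summing expression used above, with an illustrative (partition index -> offset) shape for offset_info:

example_offset_info = [{"0": "10"}, {"1": "5"}, {"2": "0"}]
total = sum(map(lambda partition: sum(map(int, partition.values())), example_offset_info))
assert total == 15  # 10 + 5 + 0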
Example #36
def check_cache_refresh_fails_409conflict():
    output = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                             'state refresh_cache')
    if "failed: 409 Conflict" in output:
        return True
    return False
Example #37
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec."""

    deployment_plan = sdk_plan.wait_for_completed_deployment(
        config.SERVICE_NAME)
    log.info("deployment_plan: " + str(deployment_plan))

    # test that the deployment plan is correct
    assert (len(deployment_plan['phases']) == 5)
    assert (deployment_plan['phases'][0]['name'] == 'hello-overlay-deploy')
    assert (deployment_plan['phases'][1]['name'] == 'hello-overlay-vip-deploy')
    assert (deployment_plan['phases'][2]['name'] == 'hello-host-vip-deploy')
    assert (deployment_plan['phases'][3]['name'] == 'hello-host-deploy')
    assert (deployment_plan["phases"][4]["name"] == "getter-deploy")
    assert (len(deployment_plan['phases'][0]['steps']) == 1)
    assert (len(deployment_plan["phases"][1]["steps"]) == 1)
    assert (len(deployment_plan["phases"][2]["steps"]) == 1)
    assert (len(deployment_plan["phases"][3]["steps"]) == 1)
    assert (len(deployment_plan["phases"][4]["steps"]) == 1)

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(config.SERVICE_NAME,
                                               timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME,
                                             timeout_seconds=60)
    except retrying.RetryError:
        pass

    # test that the tasks are all up, which tests the overlay DNS
    framework_tasks = shakedown.get_service_tasks(config.SERVICE_NAME, completed=False)
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert (expected_task
                in framework_task_names), "Missing {expected}".format(
                    expected=expected_task)

    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:  # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources.keys(
            ), "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources.keys(
            ), "Task {} should NOT have port resources".format(name)

    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server",
                                    expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server",
                                    expected_network_name=None)

    endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                       config.SERVICE_NAME,
                                       'endpoints',
                                       json=True)
    assert len(endpoints_result) == 2, \
        "Expected 2 endpoints, got {}".format(len(endpoints_result))

    overlay_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                               config.SERVICE_NAME,
                                               'endpoints overlay-vip',
                                               json=True)
    assert "address" in overlay_endpoints_result.keys(), "overlay endpoints missing 'address'"\
           "{}".format(overlay_endpoints_result)
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-overlay-vip-0-server", 4044)

    host_endpoints_result = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                            config.SERVICE_NAME,
                                            'endpoints host-vip',
                                            json=True)
    assert "address" in host_endpoints_result.keys(), "overlay endpoints missing 'address'"\
           "{}".format(host_endpoints_result)
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-host-vip-0-server", 4044)
Example #38
def _check_json_output(svc_name, cmd):
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, svc_name, cmd)
    assert rc == 0, "Command failed: {}".format(cmd)
    # Check that stdout is valid json:
    json.loads(stdout)
Example #39
def check_cache_refresh():
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                                    "debug state refresh_cache")
    assert rc == 0, "Refresh cache failed"
    return stdout
Example #40
def check_cache_refresh_fails_409conflict():
    rc, stdout, stderr = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                         foldered_name,
                                         "debug state refresh_cache")
    return rc != 0 and stdout == "" and "failed: 409 Conflict" in stderr
Example #41
def test_help_cli():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "help")
Example #42
def test_authz_acls_not_required(kafka_client, service_account, setup_principals):
    try:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
        service_options = {
            "service": {
                "name": config.SERVICE_NAME,
                "service_account": service_account["name"],
                "service_account_secret": service_account["secret"],
                "security": {
                    "transport_encryption": {"enabled": True},
                    "ssl_authentication": {"enabled": True},
                    "authorization": {
                        "enabled": True,
                        "super_users": "User:{}".format("super"),
                        "allow_everyone_if_no_acl_found": True,
                    },
                },
            }
        }
        config.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            config.DEFAULT_BROKER_COUNT,
            additional_options=service_options,
        )

        kafka_server = {**service_options, **{"package_name": config.PACKAGE_NAME}}

        topic_name = "authz.test"
        sdk_cmd.svc_cli(
            kafka_server["package_name"],
            kafka_server["service"]["name"],
            "topic create {}".format(topic_name),
        )

        kafka_client.connect(kafka_server)

        # Since no ACLs are specified, all users can read and write.
        for user in ["authorized", "unauthorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, (
                "Read failed (user={}): "
                "MESSAGES={} "
                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes)
            )

        log.info("Writing and reading: Adding acl for authorized user")
        kafka_client.add_acls("authorized", kafka_server, topic_name)

        # After adding ACLs the authorized user and super user should still have access to the topic.
        for user in ["authorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, (
                "Read failed (user={}): "
                "MESSAGES={} "
                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes)
            )

        for user in ["unauthorized"]:
            log.info("Checking lack of write / read permissions for user=%s", user)
            write_success, _, read_messages = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert not write_success, "Write not expected to succeed (user={})".format(user)
            assert auth.is_not_authorized(read_messages), "Unauthorized expected (user={})".format(
                user
            )

    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
Example #43
def test_master_node_replace():
    # Ideally, the pod will get placed on a different agent. This test will verify that the remaining two masters
    # find the replaced master at its new IP address. This requires a reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace master-0')
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #44
def test_endpoints_zookeeper_default():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    _, zookeeper, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "endpoints zookeeper")
    assert zookeeper.rstrip("\n") == "master.mesos:2181/{}".format(
        sdk_utils.get_zk_path(foldered_name)
    )
Example #45
def test_state_cli():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "state framework_id", parse_json=True)[1]
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "state properties", parse_json=True)[1]
Example #46
def test_service_startup_rapid():
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME,
                                              task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                              config.SERVICE_NAME,
                              'pod restart {}'.format(task_short_name),
                              json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME,
                                  '{}-'.format(config.DEFAULT_POD_TYPE),
                                  [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME,
                                              task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None
                                             or started_time is None):
        stdout = sdk_cmd.run_cli(
            "task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)

    if started_time is None or starting_time is None:
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time -
            starting_time).total_seconds() <= max_restart_seconds
Example #47
def get_metrics(package_name, service_name, pod_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    pod_name -- the name of the pod containing the task
    task_name -- the name of the task whose agent to run metrics commands from
    """

    # Find task entry in mesos state:
    tasks = sdk_tasks.get_service_tasks(service_name)
    # initialize so the check below doesn't hit an unbound variable when no task matches
    task_to_check = None
    for task in tasks:
        if task.name == task_name:
            task_to_check = task
            break
    if task_to_check is None:
        raise Exception("Task named {} not found in service {}: {}".format(
            task_name, service_name, tasks))

    # Find task's container id via recent TaskStatus:
    rc, stdout, _ = sdk_cmd.svc_cli(package_name,
                                    service_name,
                                    "pod info {}".format(pod_name),
                                    print_output=False)
    assert rc == 0, "Pod info failed"
    pod_info = json.loads(stdout)
    task_container_id = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_container_id = task["status"]["containerStatus"][
                "containerId"]["value"]
            break
    if task_container_id is None:
        log.warning("Task named {} not found in pod {}: {}".format(
            task_name, pod_name, pod_info))
        return []

    # Not related to functionality, but consuming this endpoint to verify metrics integrity
    containers_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers".format(
            task_to_check.agent_id),
        retry=False,
    )
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break
    if not container_id_reported:
        raise ValueError(
            "The metrics /container endpoint returned {} for agent {}, expected {} to be returned as well"
            .format(reported_container_ids, task_to_check.agent_id,
                    task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            task_to_check.agent_id, task_container_id),
        retry=False,
    )
    app_json = json.loads(app_response.text)
    if app_json["dimensions"]["executor_id"] == task_to_check.executor_id:
        return app_json["datapoints"]

    raise Exception("No metrics found for task {} in service {}".format(
        task_name, service_name))
Example #48
def get_pod_type_instances(pod_type_prefix, service_name=SERVICE_NAME):
    pod_types = sdk_cmd.svc_cli(PACKAGE_NAME, service_name, 'pod list', json=True)
    return [pod_type for pod_type in pod_types if pod_type.startswith(pod_type_prefix)]
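
# Example usage: list only the broker pods of a Kafka service (the "kafka"
# pod-name prefix is illustrative).
broker_pods = get_pod_type_instances("kafka")
assert all(pod.startswith("kafka") for pod in broker_pods)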
Exemplo n.º 49
0
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: typing.Optional[typing.List[str]] = None,
):
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    pod_list = set(
        sdk_cmd.svc_cli(package_name, service_name, "pod list", json=True))

    # Parentheses matter here: without them, "[] + [pod_name]" binds to the
    # else branch and pod_name is dropped whenever pods_with_updated_tasks
    # is non-empty.
    pods_to_update = set((pods_with_updated_tasks or []) + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(
            sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s",
             tasks_in_other_pods)

    replace_cmd = ["pod", "replace", pod_name]
    sdk_cmd.svc_cli(package_name,
                    service_name,
                    " ".join(replace_cmd),
                    json=True)

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
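
# Example invocations matching the docstring above; the package names, pod
# layouts, and timeout are illustrative.
# Kafka: only the replaced broker's tasks should restart.
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)
# Cassandra: replacing the seed node also restarts the other ring members.
check_permanent_recovery(
    "cassandra", "cassandra", "node-0", recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"])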
Exemplo n.º 50
0
def test_help_cli():
    sdk_cmd.svc_cli(config.PACKAGE_NAME,
                    sdk_utils.get_foldered_name(config.SERVICE_NAME), 'help')
Exemplo n.º 51
0
def test_authz_acls_required(kafka_client, kafka_server, kerberos):
    client_id = kafka_client["id"]

    auth.wait_for_brokers(client_id, kafka_client["brokers"])

    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"],
                    kafka_server["service"]["name"],
                    "topic create {}".format(topic_name),
                    json=True)

    test_utils.wait_for_topic(kafka_server["package_name"],
                              kafka_server["service"]["name"], topic_name)

    message = str(uuid.uuid4())

    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("authorized", client_id, topic_name, message,
                              kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(
        read_from_topic("authorized", client_id, topic_name, 1, kerberos))

    log.info("Writing and reading: Reading from the topic, as super user")
    assert message in read_from_topic("super", client_id, topic_name, 1,
                                      kerberos)

    zookeeper_endpoint = sdk_cmd.svc_cli(kafka_server["package_name"],
                                         kafka_server["service"]["name"],
                                         "endpoint zookeeper").strip()

    # TODO: If zookeeper has Kerberos enabled, then the environment should be changed
    topics.add_acls("authorized",
                    client_id,
                    topic_name,
                    zookeeper_endpoint,
                    env_str=None)

    # Send a second message; with the ACL in place, the authorized user's
    # write should now succeed.
    second_message = str(uuid.uuid4())
    log.info("Writing and reading: Writing to the topic, but not super user")
    assert write_to_topic("authorized", client_id, topic_name, second_message,
                          kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, second_message,
                          kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    topic_output = read_from_topic("authorized", client_id, topic_name, 3,
                                   kerberos)
    assert message in topic_output
    assert second_message in topic_output

    log.info("Writing and reading: Reading from the topic, as super user")
    topic_output = read_from_topic("super", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output

    # Check that the unauthorized client still can neither read from nor write to the topic.
    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("unauthorized", client_id, topic_name,
                              second_message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(
        read_from_topic("unauthorized", client_id, topic_name, 1, kerberos))
Exemplo n.º 52
0
    def restart_zookeeper_node(node_id: int):
        sdk_cmd.svc_cli(ZK_PACKAGE, ZK_SERVICE_NAME, "pod restart zookeeper-{}".format(node_id))

        sdk_plan.wait_for_kicked_off_recovery(ZK_SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(ZK_SERVICE_NAME)
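
    # Example usage: restart each node of the ensemble in turn, waiting for
    # recovery between restarts (the three-node size is illustrative).
    for node_id in range(3):
        restart_zookeeper_node(node_id)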
Exemplo n.º 53
0
def test_authz_acls_required(kafka_client: client.KafkaClient,
                             kafka_server: dict,
                             kerberos: sdk_auth.KerberosEnvironment):

    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"],
                    kafka_server["service"]["name"],
                    "topic create {}".format(topic_name),
                    json=True)

    kafka_client.connect(kafka_server)

    # Since no ACLs are specified, only the super user can read and write
    for user in [
            "super",
    ]:
        log.info("Checking write / read permissions for user=%s", user)
        write_success, read_successes, _ = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert write_success, "Write failed (user={})".format(user)
        assert read_successes, "Read failed (user={}): " \
                               "MESSAGES={} " \
                               "read_successes={}".format(user,
                                                          kafka_client.MESSAGES,
                                                          read_successes)

    for user in [
            "authorized",
            "unauthorized",
    ]:
        log.info("Checking lack of write / read permissions for user=%s", user)
        write_success, _, read_messages = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert not write_success, "Write not expected to succeed (user={})".format(
            user)
        assert auth.is_not_authorized(
            read_messages), "Unauthorized expected (user={})".format(user)

    log.info("Writing and reading: Adding acl for authorized user")
    kafka_client.add_acls("authorized", kafka_server, topic_name)

    # After adding ACLs the authorized user and super user should still have access to the topic.
    for user in ["authorized", "super"]:
        log.info("Checking write / read permissions for user=%s", user)
        write_success, read_successes, _ = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert write_success, "Write failed (user={})".format(user)
        assert read_successes, "Read failed (user={}): " \
                               "MESSAGES={} " \
                               "read_successes={}".format(user,
                                                          kafka_client.MESSAGES,
                                                          read_successes)

    for user in [
            "unauthorized",
    ]:
        log.info("Checking lack of write / read permissions for user=%s", user)
        write_success, _, read_messages = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert not write_success, "Write not expected to succeed (user={})".format(
            user)
        assert auth.is_not_authorized(
            read_messages), "Unauthorized expected (user={})".format(user)
Exemplo n.º 54
0
 def check_cache_refresh():
     return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                            'state refresh_cache')
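
 # Example usage, as a sketch: poll until the scheduler acknowledges the
 # cache refresh. The plain loop is illustrative; the surrounding test may
 # wrap check_cache_refresh in a retry decorator instead.
 import time

 def wait_for_cache_refresh(timeout_seconds=60):
     deadline = time.time() + timeout_seconds
     while time.time() < deadline:
         if check_cache_refresh():  # truthy when the CLI returns output
             return
         time.sleep(5)
     raise Exception("Timed out waiting for state cache refresh")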
Exemplo n.º 55
0
def test_authz_acls_not_required(kafka_client: client.KafkaClient,
                                 zookeeper_server, kerberos):
    try:
        zookeeper_dns = sdk_cmd.svc_cli(zookeeper_server["package_name"],
                                        zookeeper_server["service"]["name"],
                                        "endpoint clientport",
                                        json=True)["dns"]

        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
        service_options = {
            "service": {
                "name": config.SERVICE_NAME,
                "security": {
                    "kerberos": {
                        "enabled": True,
                        "enabled_for_zookeeper": True,
                        "kdc": {
                            "hostname": kerberos.get_host(),
                            "port": int(kerberos.get_port())
                        },
                        "realm": kerberos.get_realm(),
                        "keytab_secret": kerberos.get_keytab_path(),
                    },
                    "authorization": {
                        "enabled": True,
                        "super_users": "User:{}".format("super"),
                        "allow_everyone_if_no_acl_found": True
                    }
                }
            },
            "kafka": {
                "kafka_zookeeper_uri": ",".join(zookeeper_dns)
            }
        }

        config.install(config.PACKAGE_NAME,
                       config.SERVICE_NAME,
                       config.DEFAULT_BROKER_COUNT,
                       additional_options=service_options)

        kafka_server = {
            **service_options,
            **{
                "package_name": config.PACKAGE_NAME
            }
        }

        topic_name = "authz.test"
        sdk_cmd.svc_cli(kafka_server["package_name"],
                        kafka_server["service"]["name"],
                        "topic create {}".format(topic_name),
                        json=True)

        kafka_client.connect(kafka_server)

        # Clear the ACLs
        kafka_client.remove_acls("authorized", kafka_server, topic_name)

        # Since no ACLs are specified, all users can read and write.
        for user in [
                "authorized",
                "unauthorized",
                "super",
        ]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, "Read failed (user={}): " \
                                   "MESSAGES={} " \
                                   "read_successes={}".format(user,
                                                              kafka_client.MESSAGES,
                                                              read_successes)

        log.info("Writing and reading: Adding acl for authorized user")
        kafka_client.add_acls("authorized", kafka_server, topic_name)

        # After adding ACLs the authorized user and super user should still have access to the topic.
        for user in [
                "authorized",
                "super",
        ]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, "Read failed (user={}): " \
                                   "MESSAGES={} " \
                                   "read_successes={}".format(user,
                                                              kafka_client.MESSAGES,
                                                              read_successes)

        for user in [
                "unauthorized",
        ]:
            log.info("Checking lack of write / read permissions for user=%s",
                     user)
            write_success, _, read_messages = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert not write_success, "Write not expected to succeed (user={})".format(
                user)
            assert auth.is_not_authorized(
                read_messages), "Unauthorized expected (user={})".format(user)

    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
Exemplo n.º 56
0
 def check_for_nonempty_properties():
     jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                               foldered_name,
                               'state properties',
                               json=True)
     return len(jsonobj) > 0
Exemplo n.º 57
0
def test_endpoints_zookeeper_default():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                                'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == 'master.mesos:2181/{}'.format(
        sdk_utils.get_zk_path(foldered_name))
Exemplo n.º 58
0
def test_coordinator_node_replace():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace coordinator-0')
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Exemplo n.º 59
0
def broker_count_check(count, service_name=config.SERVICE_NAME):
    brokers = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                              service_name,
                              'broker list',
                              json=True)
    return len(brokers) == count
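
# Example usage (assumes the deploy plan has completed so the broker count
# is stable):
assert broker_count_check(config.DEFAULT_BROKER_COUNT)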
Exemplo n.º 60
0
 def get_zookeeper_connect(self) -> str:
     return str(
         sdk_cmd.svc_cli(self._package_name, self._service_name,
                         "endpoint zookeeper")).strip()