def test_config_update_eventually_succeeds_after_all_brokers_fail():
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    run_planned_operation(
        increment_broker_port_config,
        lambda: [kill_task_with_pattern('kafka.Kafka', h) for h in hosts])
    check_health()

def test_service_becomes_healthy_after_agent_is_partitioned():
    host = get_broker_host()
    spin(shakedown.partition_agent, lambda x: (True, ''), host)
    shakedown.reconnect_agent(host)
    check_health()

def test_upgrade_downgrade():
    # Ensure both Universe and the test repo exist.
    if len(shakedown.get_package_repos()['repositories']) != 2:
        print('No test repo found. Skipping test_upgrade_downgrade')
        return

    test_version = get_pkg_version()
    print('Found test version: {}'.format(test_version))
    add_repo(MASTER_CUSTOM_NAME, MASTER_CUSTOM_URL, prev_version=test_version)

    master_version = get_pkg_version()
    print('Found master version: {}'.format(master_version))

    print('Installing master version')
    install({'package_version': master_version})
    check_health()
    write_some_data("data-0-node.hdfs.mesos", TEST_FILE_NAME)
    # give the write a chance to succeed and replication to occur
    time.sleep(5)

    print('Upgrading to test version')
    destroy_and_install(test_version)
    check_health_after_version_change()

    print('Downgrading to master version')
    destroy_and_install(master_version)
    check_health_after_version_change()

    # clean up
    remove_repo(prev_version=master_version)

def test_config_update_eventually_succeeds_after_scheduler_fails():
    host = get_scheduler_host()
    run_planned_operation(
        increment_broker_port_config,
        lambda: kill_task_with_pattern('kafka.scheduler.Main', host))
    check_health()

def test_config_update_eventually_succeeds_after_zk_fails():
    run_planned_operation(
        increment_broker_port_config,
        lambda: kill_task_with_pattern('zookeeper'))
    check_health()

def test_upgrade_downgrade():
    # Ensure both Universe and the test repo exist. @mgummelt
    if len(shakedown.get_package_repos()['repositories']) != 2:
        print('No kafka test repo found. Skipping test_upgrade_downgrade')
        return

    test_repo_name, test_repo_url = get_test_repo_info()

    test_version = get_pkg_version()
    print('Found test version: {}'.format(test_version))
    remove_repo(test_repo_name, test_version)

    master_version = get_pkg_version()
    print('Found master version: {}'.format(master_version))

    print('Installing master version')
    install(package_version=master_version)
    check_health()

    plan = get_plan(lambda p: p['status'] == 'COMPLETE')
    assert plan['status'] == 'COMPLETE'

    topics_are_available()
    write_messages()

    print('Upgrading to test version')
    destroy_service()
    add_repo(test_repo_name, test_repo_url, master_version)
    install(package_version=test_version)
    check_post_version_change_health()

    print('Downgrading to master version')
    destroy_service()
    install(package_version=master_version)
    check_post_version_change_health()

def test_deploy():
    # taskcfg.yml will initially fail to deploy because several options are missing in the
    # default marathon.json.mustache. Verify that tasks are failing for 30s before continuing.
    print('Checking that tasks are failing to launch for at least 30s')

    end_time = time.time() + 30
    # we can get brief blips of TASK_RUNNING, but they shouldn't last more than 2-3s:
    consecutive_task_running = 0
    while time.time() < end_time:
        try:
            tasks = shakedown.get_service_tasks(PACKAGE_NAME)
        except Exception:
            continue
        states = [t['state'] for t in tasks]
        print('Task states: {}'.format(states))
        if TASK_RUNNING_STATE in states:
            consecutive_task_running += 1
            assert consecutive_task_running <= 3
        else:
            consecutive_task_running = 0
        time.sleep(1)

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    config = get_marathon_config()
    env = config['env']
    del env['SLEEP_DURATION']
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    request(dcos.http.put, marathon_api_url('apps/' + PACKAGE_NAME), json=config)

    check_health()

def test_service_becomes_healthy_after_all_agents_are_partitioned():
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    for host in hosts:
        spin(shakedown.partition_agent, lambda x: (True, ''), host)
    for host in hosts:
        shakedown.reconnect_agent(host)
    check_health()

def test_can_adjust_config_from_dynamic_to_dynamic_port():
    check_health()
    connections = get_connection_info()['address']

    config = get_kafka_config()
    brokerCpus = float(config['env']['BROKER_CPUS'])
    config['env']['BROKER_CPUS'] = str(brokerCpus + 0.1)
    update_kafka_config(config)

    check_health()

def test_can_adjust_config_from_dynamic_to_dynamic_port():
    check_health()
    connections = get_connection_info()['address']

    config = get_kafka_config()
    brokerCpus = float(config['env']['BROKER_CPUS'])
    config['env']['BROKER_CPUS'] = str(brokerCpus + 0.1)
    r = request(dcos.http.put, marathon_api_url('apps/kafka'), json=config)

    check_health()

def test_config_update_eventually_succeeds_after_agent_is_partitioned():
    host = get_broker_host()

    def partition():
        spin(shakedown.partition_agent, lambda x: (True, ''), host)
        shakedown.reconnect_agent(host)

    run_planned_operation(increment_broker_port_config, partition)
    check_health()

def test_config_update_eventually_succeeds_after_all_agents_are_partitioned():
    hosts = shakedown.get_service_ips(PACKAGE_NAME)

    def partition():
        for host in hosts:
            spin(shakedown.partition_agent, lambda x: (True, ''), host)
        for host in hosts:
            shakedown.reconnect_agent(host)

    run_planned_operation(increment_broker_port_config, partition)
    check_health()

def test_can_adjust_config_from_dynamic_to_dynamic_port():
    check_health()
    connections = get_connection_info()['address']

    config = get_kafka_config()
    config['env']['KAFKA_VER_NAME'] = 'kafka-nonce-ver'
    r = request(dcos.http.put, marathon_api_url('apps/kafka'), json=config)

    check_health()
    result = get_connection_info()
    assert (set([a.split(':')[-1] for a in result['address']]) ==
            set([a.split(':')[-1] for a in connections]))

def test_bump_hello_cpus():
    check_health()
    hello_ids = get_task_ids('hello')
    print('hello ids: ' + str(hello_ids))

    config = get_marathon_config()
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['HELLO_CPUS'] = str(cpus + 0.1)
    request(dcos.http.put, marathon_api_url('apps/' + PACKAGE_NAME), json=config)

    tasks_updated('hello', hello_ids)
    check_health()

def test_can_adjust_config_from_static_to_dynamic_port():
    check_health()

    config = get_kafka_config()
    config['env']['BROKER_PORT'] = '0'
    update_kafka_config(config)

    check_health()
    result = get_connection_info()
    assert len(result['address']) == 3
    for hostport in result['address']:
        assert 9092 != int(hostport.split(':')[-1])

def setup_module(module):
    uninstall()
    if strict_mode == 'strict':
        shakedown.install_package_and_wait(
            package_name=PACKAGE_NAME,
            options_file=os.path.dirname(
                os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/strict.json")
    else:
        shakedown.install_package_and_wait(package_name=PACKAGE_NAME, options_file=None)
    check_health()

def test_can_adjust_config_from_static_to_dynamic_port():
    check_health()

    config = get_kafka_config()
    config['env']['BROKER_PORT'] = '0'
    r = request(dcos.http.put, marathon_api_url('apps/kafka'), json=config)

    check_health()
    result = get_connection_info()
    assert len(result['address']) == 3
    for hostport in result['address']:
        assert 9092 != int(hostport.split(':')[-1])

def test_bump_hello_nodes():
    check_health()
    hello_ids = get_task_ids('hello')
    print('hello ids: ' + str(hello_ids))

    config = get_marathon_config()
    nodeCount = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(nodeCount)
    request(dcos.http.put, marathon_api_url('apps/' + PACKAGE_NAME), json=config)

    check_health()
    tasks_not_updated('hello', hello_ids)

def test_upgrade():
    test_version = get_pkg_version()
    print('Found test version: {}'.format(test_version))
    add_repo(test_version)

    master_version = get_pkg_version()
    print('Found master version: {}'.format(master_version))

    print('Installing master version')
    install(master_version)
    check_health()

    print('Upgrading to test version')
    destroy_service()
    install(test_version)
    check_health()

    # clean up
    remove_repo(master_version)

def test_bump_data_nodes():
    check_health()
    data_ids = get_task_ids('data')
    print('data ids: ' + str(data_ids))

    config = get_marathon_config()
    nodeCount = int(config['env']['DATA_COUNT']) + 1
    config['env']['DATA_COUNT'] = str(nodeCount)
    r = request(
        dcos.http.put,
        marathon_api_url('apps/' + PACKAGE_NAME),
        json=config)

    check_health(DEFAULT_HDFS_TASK_COUNT + 1)
    tasks_not_updated('data', data_ids)

def test_integrity_on_data_node_failure():
    shakedown.wait_for(
        lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_1_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    # give the write a chance to succeed and replication to occur
    time.sleep(5)

    kill_task_with_pattern("DataNode", 'data-0-node.hdfs.mesos')
    kill_task_with_pattern("DataNode", 'data-1-node.hdfs.mesos')
    time.sleep(1)  # give the DataNodes a chance to die

    shakedown.wait_for(
        lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_1_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    check_health()

def test_integrity_on_name_node_failure():
    """
    The first name node (name-0-node) is the active name node by default when HDFS gets installed.
    This test checks that it is possible to write and read data after the first name node fails.
    """
    kill_task_with_pattern("NameNode", 'name-0-node.hdfs.mesos')
    time.sleep(1)  # give NameNode a chance to die

    shakedown.wait_for(
        lambda: write_data_to_hdfs("data-0-node.hdfs.mesos", TEST_FILE_2_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    shakedown.wait_for(
        lambda: read_data_from_hdfs("data-2-node.hdfs.mesos", TEST_FILE_2_NAME),
        HDFS_CMD_TIMEOUT_SEC)

    check_health()

def test_upgrade():
    test_repo_name, test_repo_url = get_test_repo_info()

    test_version = get_pkg_version()
    print('Found test version: {}'.format(test_version))
    remove_repo(test_repo_name, test_version)

    master_version = get_pkg_version()
    print('Found master version: {}'.format(master_version))

    print('Installing master version')
    install({'package_version': master_version})
    check_health()
    write_messages()

    print('Upgrading to test version')
    destroy_service()
    add_repo(test_repo_name, test_repo_url, master_version)
    install({'package_version': test_version})
    check_post_upgrade_health()

def test_bump_journal_cpus():
    check_health()
    journal_ids = get_task_ids('journal')
    print('journal ids: ' + str(journal_ids))

    config = get_marathon_config()
    print('marathon config: ')
    print(config)
    cpus = float(config['env']['JOURNAL_CPUS'])
    config['env']['JOURNAL_CPUS'] = str(cpus + 0.1)
    r = request(
        dcos.http.put,
        marathon_api_url('apps/' + PACKAGE_NAME),
        json=config)

    tasks_updated('journal', journal_ids)
    check_health()

def test_bump_metadata_cpus():
    check_health()
    meta_data_ids = get_task_ids('meta-data')
    print('meta-data ids: ' + str(meta_data_ids))
    data_ids = get_task_ids('data')
    print('data ids: ' + str(data_ids))

    config = get_marathon_config()
    cpus = float(config['env']['METADATA_CPU'])
    config['env']['METADATA_CPU'] = str(cpus + 0.1)
    r = request(dcos.http.put, marathon_api_url('apps/' + PACKAGE_NAME), json=config)

    tasks_updated('meta-data', meta_data_ids)
    tasks_not_updated('data', data_ids)
    check_health()

def test_bump_world_nodes():
    check_health()
    hello_ids = get_task_ids('hello')
    print('hello ids: ' + str(hello_ids))
    world_ids = get_task_ids('world')
    print('world ids: ' + str(world_ids))

    config = get_marathon_config()
    worldNodeCount = int(config['env']['WORLD_COUNT']) + 1
    config['env']['WORLD_COUNT'] = str(worldNodeCount)
    r = request(dcos.http.put, marathon_api_url('apps/' + PACKAGE_NAME), json=config)

    check_health(DEFAULT_TASK_COUNT + 1)
    tasks_not_updated('hello', hello_ids)
    tasks_not_updated('world', world_ids)

def test_pods_replace():
    world_ids = get_task_ids('world-0')

    # get current agent id:
    stdout = run_dcos_cli_cmd('hello-world pods info world-0')
    old_agent = json.loads(stdout)[0]['info']['slaveId']['value']

    jsonobj = json.loads(run_dcos_cli_cmd('hello-world pods replace world-0'))
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'world-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'world-0-server'

    tasks_updated('world-0', world_ids)
    check_health()

    # check agent moved:
    stdout = run_dcos_cli_cmd('hello-world pods info world-0')
    new_agent = json.loads(stdout)[0]['info']['slaveId']['value']

def test_marathon_rack_not_found():
    # install without waiting, since the install should never succeed and a timeout would
    # result in an assertion failure
    install(
        additional_options={'service': {'placement_constraint': 'rack_id:LIKE:rack-foo-.*'}},
        wait=False)

    try:
        check_health()
        assert False, "Should have failed health check"
    except:
        pass  # expected to fail; we just wanted to wait

    plan = get_plan()

    # check that the first node is still (unsuccessfully) looking for a match:
    # reconciliation is complete
    assert plan['status'] == 'IN_PROGRESS'
    # phase is pending
    assert plan['phases'][1]['status'] == 'PENDING'
    # step is pending
    assert plan['phases'][1]['steps'][0]['status'] == 'PENDING'

    uninstall()

def test_pods_restart():
    hello_ids = get_task_ids('hello-0')

    # get current agent id:
    stdout = run_dcos_cli_cmd('hello-world pods info hello-0')
    old_agent = json.loads(stdout)[0]['info']['slaveId']['value']

    stdout = run_dcos_cli_cmd('hello-world pods restart hello-0')
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'hello-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'hello-0-server'

    tasks_updated('hello', hello_ids)
    check_health()

    # check agent didn't move:
    stdout = run_dcos_cli_cmd('hello-world pods info hello-0')
    new_agent = json.loads(stdout)[0]['info']['slaveId']['value']
    assert old_agent == new_agent

def test_upgrade_downgrade():
    # Ensure both Universe and the test repo exist.
    # In particular, the Framework Test Suite only runs packages from Universe;
    # it doesn't add a test repo like the PR jobs.
    if len(shakedown.get_package_repos()['repositories']) != 2:
        print('No test repo found. Skipping test_upgrade_downgrade')
        return

    test_repo_name, test_repo_url = get_test_repo_info()

    test_version = get_pkg_version()
    print('Found test version: {}'.format(test_version))
    remove_repo(test_repo_name, test_version)

    master_version = get_pkg_version()
    print('Found master version: {}'.format(master_version))

    print('Installing master version')
    install(master_version)
    check_health()

    print('Upgrading to test version')
    destroy_service()
    add_repo(test_repo_name, test_repo_url, prev_version=master_version)
    install(test_version)
    check_health()

    print('Downgrading to master version')
    destroy_service()
    install(master_version)
    check_health()

def test_static_port_comes_online(static_port_config):
    check_health()

def test_service_becomes_healthy_after_master_fails():
    kill_task_with_pattern('mesos-master')
    check_health()

def test_service_becomes_healthy_after_zk_fails():
    kill_task_with_pattern('zookeeper')
    check_health()

def test_service_becomes_healthy_after_scheduler_fails():
    kill_task_with_pattern('kafka.scheduler.Main', get_scheduler_host())
    check_health()

def test_service_becomes_healthy_after_all_brokers_fail():
    for host in shakedown.get_service_ips(PACKAGE_NAME):
        kill_task_with_pattern('kafka.Kafka', host)
    check_health()

def setup_module(module):
    uninstall()
    install(DYNAMIC_PORT_OPTIONS_DICT)
    check_health()

def test_dynamic_port_comes_online(dynamic_port_config):
    check_health()

def test_service_becomes_healthy_after_broker_fails():
    kill_task_with_pattern('kafka.Kafka', get_broker_host())
    check_health()