def get_tasks_avoiding_scheduler(service_name, task_name_pattern):
    '''Return the matching tasks that share a host with neither the scheduler nor the package registry.

    The scheduler's machine is spared purely to keep tests fast. Marathon would
    eventually relaunch the scheduler on another node and the repair could
    continue from there, but Mesos takes 5-20 minutes to decide an agent is
    dead. That delay is also why host-down is verified with a manual 'ls'
    check rather than by waiting for Mesos.

    service_name: Marathon task name used to look up the scheduler IP.
    task_name_pattern: object with a `match(name)` method (e.g. a compiled
        regex) applied to each task's name.
    '''
    registry_name = sdk_package_registry.PACKAGE_REGISTRY_SERVICE_NAME
    excluded_names = {registry_name}

    candidates = []
    for task in get_summary():
        if task.name in excluded_names:
            continue
        if task_name_pattern.match(task.name):
            candidates.append(task)

    scheduler_ip = shakedown.get_service_ips('marathon', service_name).pop()
    log.info('Scheduler IP: {}'.format(scheduler_ip))

    # Always avoid package registry (if present)
    registry_ips = shakedown.get_service_ips('marathon', registry_name)
    log.info('Package Registry [{}] IP(s): {}'.format(registry_name, registry_ips))

    excluded_ips = {scheduler_ip} | set(registry_ips)
    survivors = [task for task in candidates if task.host not in excluded_ips]
    log.info('Found tasks avoiding scheduler and {} at {}: {}'.format(
        registry_name, excluded_ips, survivors))
    return survivors
def test_cleanup_then_all_executors_killed(install_framework):
    '''Run a cleanup as a planned operation while every Cassandra executor is killed.

    install_framework is not referenced in the body; presumably a pytest
    fixture requested only for its setup side effect -- confirm against the
    fixture definition.
    '''
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    run_planned_operation(
        # Pass the callable itself rather than the result of calling it.
        # NOTE(review): assumes run_planned_operation invokes its first
        # argument, as the run_repair / bump_cpu_count_config call sites imply.
        run_cleanup,
        lambda: [kill_task_with_pattern('cassandra.executor.Main', h) for h in hosts])
    check_health()
def test_kill_all_task_in_node():
    '''Kill the 'CassandraDaemon' task on every service host, recover the agents, then check health.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    for ip in service_ips:
        kill_task_with_pattern('CassandraDaemon', ip)

    recover_failed_agents(service_ips)
    check_health()
def test_all_executors_killed():
    '''Kill the 'cassandra.executor.Main' task on every service host, recover the agents, then check health.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    for ip in service_ips:
        kill_task_with_pattern('cassandra.executor.Main', ip)

    recover_failed_agents(service_ips)
    check_health()
def get_pod_to_replace():
    '''Pick a pod whose host differs from the scheduler's host.

    The scheduler's machine is deliberately spared to keep the test fast:
    Marathon would eventually relaunch the scheduler on a different node, but
    Mesos needs 5-20 minutes to decide that the agent is dead. For the same
    reason host-down is confirmed with a manual 'ls' check instead of waiting
    for Mesos to report it.

    Returns the dict (keys 'name', 'host', 'agent') of the first such pod in
    iteration order, or None when every pod shares the scheduler's host.
    '''
    scheduler_ip = shakedown.get_service_ips('marathon', config.SERVICE_NAME).pop()
    log.info('Scheduler IP: {}'.format(scheduler_ip))

    pod_names = ['node-{}'.format(index) for index in range(config.DEFAULT_TASK_COUNT)]
    pods = {
        name: {
            'name': name,
            'host': get_pod_host(name),
            'agent': get_pod_agent(name)
        }
        for name in pod_names
    }
    log.info('Pods:\n{}'.format(pprint.pformat(pods)))

    for candidate in pods.values():
        if candidate['host'] == scheduler_ip:
            continue
        log.info('Found pod avoiding scheduler at {}: {}'.format(
            scheduler_ip, candidate))
        return candidate
    return None
def test_config_update_eventually_succeeds_after_all_brokers_fail():
    '''Run increment_broker_port_config as a planned operation with a failure step that kills 'kafka.Kafka' on every host.'''
    broker_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_brokers():
        return [kill_task_with_pattern('kafka.Kafka', ip) for ip in broker_ips]

    run_planned_operation(increment_broker_port_config, kill_all_brokers)
    check_health()
def test_repair_then_kill_all_task_in_node():
    '''Run run_repair as a planned operation with a failure step that kills 'CassandraDaemon' on every host.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_daemons():
        return [kill_task_with_pattern('CassandraDaemon', ip) for ip in service_ips]

    run_planned_operation(run_repair, kill_all_daemons)
    check_health()
def test_data_survives_crash():
    '''Check that rows inserted before the CockroachDB nodes are killed can still be read afterwards.

    Creates a `bank` database with an `accounts` table holding two rows, then
    kills the "cockroach start" process on each service host, one at a time.
    After each kill it waits for the expected task count and for
    cockroach_nodes_healthy() to succeed. Finally it selects from the table
    and asserts that the CLI output contains '2 rows'.
    '''
    # Both waits share the same five-minute budget.
    recovery_timeout_seconds = 5 * 60

    # Generate SQL Commands
    cmd_drop_database = cockroach_cmd('DROP DATABASE IF EXISTS bank;')
    cmd_create_database = cockroach_cmd('CREATE DATABASE bank;')
    cmd_create_table = cockroach_cmd(
        'CREATE TABLE accounts (id INT PRIMARY KEY, balance INT);', 'bank')
    cmd_insert = cockroach_cmd(
        'INSERT INTO accounts (id, balance) VALUES (1, 1000), (2, 250);', 'bank')
    cmd_select = cockroach_cmd('SELECT id, balance FROM accounts;', 'bank')

    # Run SQL Commands (except cmd_select). Their output is never inspected,
    # so it is not bound to any name.
    cmd.run_cli(cmd_drop_database)
    cmd.run_cli(cmd_create_database)
    cmd.run_cli(cmd_create_table)
    cmd.run_cli(cmd_insert)

    # Kill All CockroachDB Nodes (one at a time)
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    for service_ip in service_ips:
        # Kill CockroachDB node
        shakedown.kill_process_on_host(service_ip, "cockroach start")
        # Wait for new CockroachDB node to run
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT, recovery_timeout_seconds)
        # Wait for healthy CockroachDB cluster
        shakedown.wait_for(
            lambda: cockroach_nodes_healthy(),
            noisy=True,
            timeout_seconds=recovery_timeout_seconds)

    # Run cmd_select
    out_select = cmd.run_cli(cmd_select)

    # Confirm Output
    assert '2 rows' in out_select
def test_all_partition():
    '''Partition every service agent, reconnect them all, then check the service is running.'''
    agent_ips = shakedown.get_service_ips(config.SERVICE_NAME)

    # Cut off every agent before bringing any of them back.
    for ip in agent_ips:
        shakedown.partition_agent(ip)

    for ip in agent_ips:
        shakedown.reconnect_agent(ip)

    config.check_running()
def test_cleanup_then_kill_all_task_in_node(install_framework):
    '''Run run_cleanup as a planned operation with a failure step that kills 'CassandraDaemon' on every host.

    install_framework is not referenced in the body; presumably a pytest
    fixture requested for its setup side effect -- confirm.
    '''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_daemons():
        return [kill_task_with_pattern('CassandraDaemon', ip) for ip in service_ips]

    run_planned_operation(run_cleanup, kill_all_daemons)
    check_health()
def test_config_updates_then_all_executors_killed():
    '''Bump the world CPUs, kill every executor, and expect the 'world' tasks to be updated.

    Records the current 'world' task ids, calls config.bump_world_cpus(),
    kills 'helloworld.executor.Main' on each service host, then checks that
    the 'world' task ids changed and that the service is running.
    '''
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # The kills are pure side effects, so use a plain loop instead of
    # building a list that is thrown away.
    for host in hosts:
        sdk_tasks.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_kill_all_task_in_node():
    '''Run bump_cpu_count_config as a planned operation with a failure step that kills 'CassandraDaemon' on every host.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_daemons():
        return [kill_task_with_pattern('CassandraDaemon', ip) for ip in service_ips]

    run_planned_operation(bump_cpu_count_config, kill_all_daemons)
    check_health()
def test_repair_then_all_executors_killed():
    '''Run run_repair as a planned operation with a failure step that kills 'cassandra.executor.Main' on every host.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_executors():
        return [kill_task_with_pattern('cassandra.executor.Main', ip) for ip in service_ips]

    run_planned_operation(run_repair, kill_all_executors)
    check_health()
def test_config_update_then_all_executors_killed():
    '''Run bump_cpu_count_config as a planned operation with a failure step that kills 'cassandra.executor.Main' on every host.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_executors():
        return [kill_task_with_pattern('cassandra.executor.Main', ip) for ip in service_ips]

    run_planned_operation(bump_cpu_count_config, kill_all_executors)
    check_health()
def test_shutdown_host_test():
    '''Shut down the host of one pod for good, replace the pod, and verify it moved to another agent.

    The first pod whose host differs from the scheduler's host is chosen. After
    issuing `sudo shutdown -h +1` on that host the test sleeps 100 seconds,
    runs `cassandra pods replace`, waits for the 'node' tasks to be updated
    and for all tasks to run, then asserts the pod's agent changed.
    '''
    marathon_ips = shakedown.get_service_ips('marathon', PACKAGE_NAME)
    scheduler_ip = marathon_ips.pop()
    sdk_utils.out('marathon ip = {}'.format(scheduler_ip))

    # pod_id deliberately outlives the loop: it identifies the chosen pod below.
    node_ip = None
    for pod_id in range(DEFAULT_TASK_COUNT):
        candidate_host = get_pod_host(pod_id)
        if candidate_host == scheduler_ip:
            continue
        node_ip = candidate_host
        break

    assert node_ip is not None, 'Could not find a node to shut down'

    old_agent = get_pod_agent(pod_id)
    sdk_utils.out('pod id = {}, node_ip = {}, agent = {}'.format(
        pod_id, node_ip, old_agent))

    task_ids = tasks.get_task_ids(PACKAGE_NAME, 'node-{}'.format(pod_id))

    # instead of partitioning or reconnecting, we shut down the host permanently
    shutdown_result = shakedown.run_command_on_agent(node_ip, 'sudo shutdown -h +1')
    status, stdout = shutdown_result
    sdk_utils.out('shutdown agent {}: [{}] {}'.format(node_ip, status, stdout))
    assert status is True

    time.sleep(100)

    cmd.run_cli('cassandra pods replace node-{}'.format(pod_id))
    tasks.check_tasks_updated(PACKAGE_NAME, 'node', task_ids)

    # double check that all tasks are running
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)

    new_agent = get_pod_agent(pod_id)
    assert old_agent != new_agent
def test_config_updates_then_all_executors_killed():
    '''Bump the world CPUs, kill every executor, and expect the 'world' tasks to be updated.

    Records the current 'world' task ids, calls config.bump_world_cpus(),
    kills 'helloworld.executor.Main' on each service host via sdk_cmd, then
    checks that the 'world' task ids changed and that the service is running.
    '''
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # The kills are pure side effects, so use a plain loop instead of
    # building a list that is thrown away.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_cleanup_then_kill_all_task_in_node(install_framework):
    '''Kill 'CassandraDaemon' on every host as the failure step of a planned run_cleanup, then check health.

    install_framework is not referenced in the body; presumably a pytest
    fixture requested for its setup side effect -- confirm.
    '''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_node():
        return [kill_task_with_pattern('CassandraDaemon', target) for target in targets]

    run_planned_operation(run_cleanup, fail_every_node)
    check_health()
def test_config_update_then_kill_all_task_in_node():
    '''Bump the world CPUs, kill the 'world' tasks on every host, and expect them to be updated.

    Records the current 'world' task ids, calls config.bump_world_cpus(),
    kills tasks matching 'world' on each service host, then checks that the
    'world' task ids changed and that the service is running.
    '''
    # kill both world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # The kills are pure side effects, so use a plain loop instead of
    # building a list that is thrown away.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_eventually_succeeds_after_all_brokers_fail():
    '''Kill 'kafka.Kafka' on every host as the failure step of a planned increment_broker_port_config, then check health.'''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_broker():
        return [kill_task_with_pattern('kafka.Kafka', target) for target in targets]

    run_planned_operation(increment_broker_port_config, fail_every_broker)
    check_health()
def test_config_update_then_kill_all_task_in_node():
    '''Kill 'CassandraDaemon' on every host as the failure step of a planned bump_cpu_count_config, then check health.'''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_node():
        return [kill_task_with_pattern('CassandraDaemon', target) for target in targets]

    run_planned_operation(bump_cpu_count_config, fail_every_node)
    check_health()
def test_repair_then_kill_all_task_in_node():
    '''Kill 'CassandraDaemon' on every host as the failure step of a planned run_repair, then check health.'''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_node():
        return [kill_task_with_pattern('CassandraDaemon', target) for target in targets]

    run_planned_operation(run_repair, fail_every_node)
    check_health()
def test_config_update_then_kill_all_task_in_node():
    '''Bump the world CPUs, kill the 'world' tasks on every host, and expect them to be updated.

    Records the current 'world' task ids, calls bump_world_cpus(), kills tasks
    matching 'world' on each service host, then checks that the 'world' task
    ids changed and that the service is running.
    '''
    # kill both world tasks
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    bump_world_cpus()
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    # The kills are pure side effects, so use a plain loop instead of
    # building a list that is thrown away.
    for host in hosts:
        tasks.kill_task_with_pattern('world', host)
    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def test_cleanup_then_all_executors_killed():
    '''Run run_cleanup as a planned operation, killing 'cassandra.executor.Main' everywhere and then recovering the agents.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_executors():
        return [kill_task_with_pattern('cassandra.executor.Main', ip) for ip in service_ips]

    def recover_agents():
        return recover_failed_agents(service_ips)

    run_planned_operation(run_cleanup, kill_all_executors, recover_agents)
    check_health()
def test_cleanup_then_kill_all_task_in_node():
    '''Run run_cleanup as a planned operation, killing 'CassandraDaemon' everywhere and then recovering the agents.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def kill_all_daemons():
        return [kill_task_with_pattern('CassandraDaemon', ip) for ip in service_ips]

    def recover_agents():
        return recover_failed_agents(service_ips)

    run_planned_operation(run_cleanup, kill_all_daemons, recover_agents)
    check_health()
def test_all_partition():
    '''Partition every service agent, reconnect them all, then check health.'''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    # Cut off every agent before bringing any of them back.
    for ip in agent_ips:
        shakedown.partition_agent(ip)

    for ip in agent_ips:
        shakedown.reconnect_agent(ip)

    check_health()
def test_service_becomes_healthy_after_all_agents_are_partitioned():
    '''Partition every service agent through spin(), reconnect them all, then check health.'''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def accept_any_result(x):
        # presumably spin()'s success predicate; always reports (True, '') -- confirm
        return (True, '')

    for ip in agent_ips:
        spin(shakedown.partition_agent, accept_any_result, ip)

    for ip in agent_ips:
        shakedown.reconnect_agent(ip)

    check_health()
def test_repair_then_kill_task_in_node():
    '''Run run_repair as a planned operation, killing 'CassandraDaemon' on one node host and then recovering all agents.'''
    all_ips = shakedown.get_service_ips(PACKAGE_NAME)
    node_host = get_node_host()

    def kill_one_daemon():
        return kill_task_with_pattern('CassandraDaemon', node_host)

    def recover_agents():
        return recover_failed_agents(all_ips)

    run_planned_operation(run_repair, kill_one_daemon, recover_agents)
    check_health()
def get_scheduler_host(service_name):
    '''Return one IP that Marathon reports for the scheduler of service_name.

    service_name may be a foldered path; it is converted to Marathon's task
    name before the lookup.

    Raises Exception when no IP is found; the message lists the names of the
    available Marathon tasks.
    '''
    # Marathon mangles foldered paths as follows: "/path/to/svc" => "svc.to.path"
    path_parts = service_name.lstrip('/').split('/')
    app_name = '.'.join(reversed(path_parts))

    ips = shakedown.get_service_ips('marathon', app_name)
    if len(ips) > 0:
        return ips.pop()

    available = [task['name'] for task in shakedown.get_service_tasks('marathon')]
    raise Exception('No IPs found for marathon task "{}". Available tasks are: {}'.format(
        app_name, available))
def test_cleanup_then_all_executors_killed(install_framework):
    '''Run a cleanup as a planned operation while every Cassandra executor is killed.

    install_framework is not referenced in the body; presumably a pytest
    fixture requested only for its setup side effect -- confirm against the
    fixture definition.
    '''
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    run_planned_operation(
        # Pass the callable itself rather than the result of calling it.
        # NOTE(review): assumes run_planned_operation invokes its first
        # argument, as the run_repair / bump_cpu_count_config call sites imply.
        run_cleanup,
        lambda: [
            kill_task_with_pattern('cassandra.executor.Main', h) for h in hosts
        ]
    )
    check_health()
def test_repair_then_all_executors_killed():
    '''Kill 'cassandra.executor.Main' on every host as the failure step of a planned run_repair, then check health.'''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_executor():
        return [kill_task_with_pattern('cassandra.executor.Main', target) for target in targets]

    run_planned_operation(run_repair, fail_every_executor)
    check_health()
def test_config_update_then_all_executors_killed():
    '''Kill 'cassandra.executor.Main' on every host as the failure step of a planned bump_cpu_count_config, then check health.'''
    targets = shakedown.get_service_ips(PACKAGE_NAME)

    def fail_every_executor():
        return [kill_task_with_pattern('cassandra.executor.Main', target) for target in targets]

    run_planned_operation(bump_cpu_count_config, fail_every_executor)
    check_health()
def test_kill_all_journalnodes():
    '''Kill 'journalnode' on every host and expect journal tasks to be replaced while data tasks are untouched.'''
    journal_before = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_before = tasks.get_task_ids(PACKAGE_NAME, 'data')

    service_ips = shakedown.get_service_ips(PACKAGE_NAME)
    for ip in service_ips:
        tasks.kill_task_with_pattern('journalnode', ip)

    check_healthy()

    # name nodes fail and restart, so don't check those
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_before)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_before)
def test_repair_then_all_partition():
    '''Run run_repair as a planned operation whose failure step partitions and then reconnects every agent.'''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def partition():
        # Cut off every agent before bringing any of them back.
        for ip in agent_ips:
            shakedown.partition_agent(ip)
        for ip in agent_ips:
            shakedown.reconnect_agent(ip)

    run_planned_operation(run_repair, partition)
    check_health()
def test_config_update_then_all_partition():
    '''Run bump_cpu_count_config as a planned operation whose failure step partitions and then reconnects every agent.'''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def partition():
        # Cut off every agent before bringing any of them back.
        for ip in agent_ips:
            shakedown.partition_agent(ip)
        for ip in agent_ips:
            shakedown.reconnect_agent(ip)

    run_planned_operation(bump_cpu_count_config, partition)
    check_health()
def test_kill_all_datanodes():
    '''Kill 'datanode' on every host and expect data tasks to be replaced while journal and name tasks are untouched.'''
    journal_before = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_before = tasks.get_task_ids(PACKAGE_NAME, 'name')
    data_before = tasks.get_task_ids(PACKAGE_NAME, 'data')

    service_ips = shakedown.get_service_ips(PACKAGE_NAME)
    for ip in service_ips:
        tasks.kill_task_with_pattern('datanode', ip)

    check_healthy()

    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_before)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_before)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'name', name_before)
def test_cleanup_then_all_partition(install_framework):
    '''Run run_cleanup as a planned operation whose failure step partitions and then reconnects every agent.

    install_framework is not referenced in the body; presumably a pytest
    fixture requested for its setup side effect -- confirm.
    '''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def partition():
        # Cut off every agent before bringing any of them back.
        for ip in agent_ips:
            shakedown.partition_agent(ip)
        for ip in agent_ips:
            shakedown.reconnect_agent(ip)

    run_planned_operation(run_cleanup, partition)
    check_health()
def test_kill_all_journalnodes():
    '''Kill 'journalnode' on every host of the foldered service; expect journal tasks replaced and data tasks untouched.'''
    journal_before = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_before = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    service_ips = shakedown.get_service_ips(FOLDERED_SERVICE_NAME)
    for ip in service_ips:
        sdk_tasks.kill_task_with_pattern('journalnode', ip)

    expect_recovery()

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_before)
def test_config_update_eventually_succeeds_after_all_agents_are_partitioned():
    '''Run increment_broker_port_config as a planned operation whose failure step partitions (via spin) and reconnects every agent.'''
    agent_ips = shakedown.get_service_ips(PACKAGE_NAME)

    def accept_any_result(x):
        # presumably spin()'s success predicate; always reports (True, '') -- confirm
        return (True, '')

    def partition():
        for ip in agent_ips:
            spin(shakedown.partition_agent, accept_any_result, ip)
        for ip in agent_ips:
            shakedown.reconnect_agent(ip)

    run_planned_operation(increment_broker_port_config, partition)
    check_health()
def get_tasks_avoiding_scheduler(service_name, task_name_pattern):
    '''Return the matching tasks that are not on the scheduler's machine.

    The scheduler's machine is spared purely to keep tests fast. Marathon would
    eventually relaunch the scheduler on another node and the repair could
    continue from there, but Mesos takes 5-20 minutes to decide an agent is
    dead. That delay is also why host-down is verified with a manual 'ls'
    check rather than by waiting for Mesos.

    service_name: Marathon task name used to look up the scheduler IP.
    task_name_pattern: object with a `match(name)` method (e.g. a compiled
        regex) applied to each task's name.
    '''
    scheduler_ip = shakedown.get_service_ips('marathon', service_name).pop()
    log.info('Scheduler IP: {}'.format(scheduler_ip))

    matching = []
    for task in get_summary():
        if task_name_pattern.match(task.name):
            matching.append(task)

    survivors = [task for task in matching if task.host != scheduler_ip]
    log.info('Found tasks avoiding scheduler at {}: {}'.format(scheduler_ip, survivors))
    return survivors
def test_shutdown_host_test():
    '''Shut down the host of one pod for good, replace the pod, and verify it moved to another agent.

    The first 'node-N' pod whose host differs from the scheduler's host is
    chosen. After issuing `sudo shutdown -h +1` on that host the test sleeps
    100 seconds, runs `pod replace`, waits for the pod's tasks to be updated,
    for all tasks to run and for the deployment to complete, then asserts the
    pod's agent changed.
    '''
    marathon_ips = shakedown.get_service_ips('marathon', config.SERVICE_NAME)
    scheduler_ip = marathon_ips.pop()
    log.info('marathon ip = {}'.format(scheduler_ip))

    # pod_name deliberately outlives the loop: it identifies the chosen pod below.
    node_ip = None
    pod_name = None
    for pod_id in range(config.DEFAULT_TASK_COUNT):
        pod_name = 'node-{}'.format(pod_id)
        candidate_host = get_pod_host(pod_name)
        if candidate_host == scheduler_ip:
            continue
        node_ip = candidate_host
        break

    assert node_ip is not None, 'Could not find a node to shut down'

    old_agent = get_pod_agent(pod_name)
    log.info('pod name = {}, node_ip = {}, agent = {}'.format(pod_name, node_ip, old_agent))

    task_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_name)

    # instead of partitioning or reconnecting, we shut down the host permanently
    shutdown_result = shakedown.run_command_on_agent(node_ip, 'sudo shutdown -h +1')
    status, stdout = shutdown_result
    log.info('shutdown agent {}: [{}] {}'.format(node_ip, status, stdout))
    assert status is True

    log.info('sleeping 100s after shutting down agent')
    time.sleep(100)

    replace_command = 'pod replace {}'.format(pod_name)
    cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, replace_command)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_name, task_ids)

    # double check that all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    new_agent = get_pod_agent(pod_name)
    assert old_agent != new_agent
def test_kill_scheduler():
    '''Kill 'hdfs.scheduler.Main' on a Marathon-reported host, then check the foldered service is healthy.'''
    marathon_ip = shakedown.get_service_ips('marathon').pop()
    sdk_cmd.kill_task_with_pattern('hdfs.scheduler.Main', marathon_ip)

    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=foldered_name)
def test_service_becomes_healthy_after_all_brokers_fail():
    '''Kill 'kafka.Kafka' on every service host, then check health.'''
    broker_ips = shakedown.get_service_ips(PACKAGE_NAME)

    for ip in broker_ips:
        kill_task_with_pattern('kafka.Kafka', ip)

    check_health()
def get_broker_host():
    '''Return one IP popped from the collection shakedown reports for PACKAGE_NAME.'''
    broker_ips = shakedown.get_service_ips(PACKAGE_NAME)
    return broker_ips.pop()
def test_all_executors_killed():
    '''Kill 'helloworld.executor.Main' on every service host, then check the service is running.'''
    service_ips = shakedown.get_service_ips(config.SERVICE_NAME)

    for ip in service_ips:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', ip)

    config.check_running()
def get_marathon_host():
    '''Return one IP popped from the collection shakedown reports for the PACKAGE_NAME task under 'marathon'.'''
    marathon_ips = shakedown.get_service_ips('marathon', PACKAGE_NAME)
    return marathon_ips.pop()
def test_kill_scheduler():
    '''Kill 'hdfs.scheduler.Main' on a Marathon-reported host, then check the foldered service is healthy.'''
    marathon_ip = shakedown.get_service_ips('marathon').pop()
    sdk_tasks.kill_task_with_pattern('hdfs.scheduler.Main', marathon_ip)

    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
def test_kill_all_task_in_node():
    '''Kill 'CassandraDaemon' on every service host, then check health.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    for ip in service_ips:
        kill_task_with_pattern('CassandraDaemon', ip)

    check_health()
def ip_of_mom():
    '''Return the first IP reported for the 'marathon-user' task under 'marathon', or None when there is none.'''
    mom_ips = shakedown.get_service_ips('marathon', 'marathon-user')
    return next(iter(mom_ips), None)
def fn():
    '''Return the service IPs for PACKAGE_NAME, or an empty set when the lookup raises IndexError.'''
    try:
        service_ips = shakedown.get_service_ips(PACKAGE_NAME)
    except IndexError:
        service_ips = set()
    return service_ips
def test_all_executors_killed():
    '''Kill 'cassandra.executor.Main' on every service host, then check health.'''
    service_ips = shakedown.get_service_ips(PACKAGE_NAME)

    for ip in service_ips:
        kill_task_with_pattern('cassandra.executor.Main', ip)

    check_health()
def get_scheduler_host():
    '''Return one IP popped from the collection shakedown reports for 'marathon'.'''
    marathon_ips = shakedown.get_service_ips('marathon')
    return marathon_ips.pop()