def test_service_becomes_healthy_after_agent_is_partitioned():
    host = get_broker_host()
    spin(shakedown.partition_agent, lambda x: (True, ''), host)
    shakedown.reconnect_agent(host)
    check_health()

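# The snippets above and below all rely on a `spin` polling helper that is not
# shown in this collection. Below is a minimal sketch of what it presumably
# looks like, inferred from its call sites: spin(fn, success_predicate, *args)
# retries fn(*args) until success_predicate(result) reports (True, msg), then
# returns fn's last result. The timeout and interval values are assumptions.
import time


def spin(fn, success_predicate, *args, timeout_seconds=600, interval_seconds=1):
    """Sketch: call fn(*args) repeatedly until success_predicate(result)
    returns (True, message); raise after timeout_seconds."""
    deadline = time.time() + timeout_seconds
    while True:
        result = fn(*args)
        success, message = success_predicate(result)
        if success:
            return result
        if time.time() >= deadline:
            raise AssertionError('Timed out waiting for success: ' + message)
        time.sleep(interval_seconds)
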
def test_lock():
    '''This test verifies that a second scheduler fails to start up when an
    existing scheduler is running. Without locking, the scheduler would fail
    during registration, but only after writing its config to ZK. So in order
    to verify that the scheduler fails immediately, we ensure that the ZK
    config state is unmodified.'''
    marathon_client = dcos.marathon.create_client()

    # Get ZK state from the running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get the marathon app
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for the second scheduler to fail
    fn = lambda: (marathon_client.get_app(app_id)
                  .get("lastTaskFailure", {})
                  .get("timestamp", None))
    success = lambda timestamp: (timestamp != old_timestamp,
                                 "second scheduler has not yet failed")
    spin(fn, success)

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new

def write_messages():
    # Kafka may not be ready to accept all msgs; retry until all are written
    def fn(num):
        try:
            offset_info = get_kafka_command(
                'topic offsets {}'.format(TOPIC_NAME))
            offset = int(offset_info[0]['0'])
            if offset < num:
                get_kafka_command('topic producer_test {} {}'.format(
                    TOPIC_NAME, num - offset))
            assert (num - offset) >= 0
            return num - offset
        except RuntimeError:
            return num

    def success_predicate(left_offset):
        return (left_offset <= 0, 'producer_test continues....')

    get_kafka_command('topic producer_test {} {}'.format(
        TOPIC_NAME, NUM_TEST_MSGS))
    spin(fn, success_predicate, NUM_TEST_MSGS)
    print('producer_test is successful {} msg available'.format(NUM_TEST_MSGS))

    # double check
    check_offsets()

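# `get_kafka_command` is also assumed rather than shown here. Based on
# get_connection_info() later in this collection, which calls
# shakedown.run_dcos_command('kafka connection') and JSON-decodes the first
# element of the result, a plausible sketch (not the actual helper) is:
import json

import shakedown


def get_kafka_command(args):
    """Sketch: run `dcos kafka <args>` via the DC/OS CLI and return the parsed
    JSON output; raise RuntimeError on failure. The two-value unpacking mirrors
    the usage in get_connection_info() below."""
    stdout, error = shakedown.run_dcos_command('kafka {}'.format(args))
    if error:
        raise RuntimeError('Command failed: kafka {}'.format(args))
    try:
        return json.loads(stdout)
    except ValueError:
        raise RuntimeError('Command did not return JSON: {}'.format(stdout))
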
def new_default_version_available(prev_version):
    def fn():
        return get_pkg_version()

    def success_predicate(pkg_version):
        return (pkg_version != prev_version, 'Package version has not changed')

    spin(fn, success_predicate)

def test_service_becomes_healthy_after_all_agents_are_partitioned():
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    for host in hosts:
        spin(shakedown.partition_agent, lambda x: (True, ''), host)
    for host in hosts:
        shakedown.reconnect_agent(host)
    check_health()

def topics_are_available():
    def fn():
        try:
            get_kafka_command('topic list')
            return True
        except RuntimeError:
            return False

    def success_predicate(topics_available):
        return (topics_available, 'Topics are not available')

    spin(fn, success_predicate)

def destroy_service():
    destroy_endpoint = marathon_api_url_with_param('apps', PACKAGE_NAME)
    request(dcos.http.delete, destroy_endpoint)

    # Make sure the scheduler has been destroyed
    def fn():
        return shakedown.get_service(PACKAGE_NAME)

    def success_predicate(service):
        return (service is None, 'Service not destroyed')

    spin(fn, success_predicate)

def check_scheduler_health():
    # Make sure the scheduler endpoint is responding and all brokers are available
    def fn():
        try:
            return get_kafka_command('broker list')
        except RuntimeError:
            return []

    def success_predicate(brokers):
        return (len(brokers) == DEFAULT_BROKER_COUNT,
                'Scheduler and all brokers not available')

    spin(fn, success_predicate)

def test_is_suppressed():
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    suppressed_url = urllib.parse.urljoin(
        dcos_url, 'service/kafka/v1/state/properties/suppressed')

    def suppress_url_check():
        response = dcos.http.get(suppressed_url)
        response.raise_for_status()
        return response.text

    def success_predicate(result):
        return (result == "true", 'Waiting for suppressed')

    spin(suppress_url_check, success_predicate)

def check_offsets():
    topics_are_available()

    # Keep trying to read the offsets until the kafka command succeeds
    def fn():
        try:
            offset_info = get_kafka_command(
                'topic offsets {}'.format(TOPIC_NAME))
            offset = int(offset_info[0]['0'])
            assert offset == NUM_TEST_MSGS
            return True
        except RuntimeError:
            return False

    def success_predicate(got_offset):
        return (got_offset, 'Unable to get offset')

    spin(fn, success_predicate)

def task_id_changes(broker_name, task_id):
    def fn():
        try:
            tasks = shakedown.get_service_tasks(PACKAGE_NAME)
            return [t for t in tasks
                    if t['state'] == TASK_RUNNING_STATE
                    and t['name'] == broker_name]
        except dcos.errors.DCOSHTTPException:
            return []

    def success_predicate(tasks):
        return (len(tasks) == 1 and tasks[0]['id'] != task_id,
                "Task ID didn't change.")

    return spin(fn, success_predicate)

def get_running_broker_task(broker_name):
    def fn():
        try:
            tasks = shakedown.get_service_tasks(PACKAGE_NAME)
            return [t for t in tasks
                    if t['state'] == TASK_RUNNING_STATE
                    and t['name'] == broker_name]
        except dcos.errors.DCOSHTTPException:
            return []

    def success_predicate(tasks):
        return (len(tasks) == 1, 'Failed to get task')

    return spin(fn, success_predicate)

def wait_for_deployment_lock_release():
    def fn():
        return dcos.http.get(marathon_api_url('deployments'))

    def pred(result):
        try:
            return (result.status_code == 200 and result.json() == [],
                    'Deployment was not unlocked')
        except json.decoder.JSONDecodeError:
            return False, 'Deployment was not unlocked'

    return spin(fn, pred)

def test_failing_health_check(static_port_config):
    broker_id = '0'
    broker_name = 'broker-' + broker_id

    def found_broker(result):
        return result is not None, 'Broker not found.'

    def broker_killed_result_checker(result):
        return result, 'Broker not killed.'

    print('Waiting for last Running Broker.')
    test_utils.spin(get_running_broker_task_id, found_broker, 'broker-2')

    # Get broker-0's task ID so we can know when it kills itself after failing
    # the health check.
    task_id = get_running_broker_task_id(broker_name)
    print("{}'s task_id is {}".format(broker_name, task_id))

    # Delete the ZK node, which should trigger the health check to kill broker-0
    shakedown.run_command_on_master(
        'wget https://github.com/outbrain/zookeepercli/releases/'
        'download/v1.0.10/zookeepercli')
    shakedown.run_command_on_master('sudo chmod +x zookeepercli')
    shakedown.run_command_on_master(
        './zookeepercli --servers 127.0.0.1 -c delete '
        '/dcos-service-kafka/brokers/ids/' + broker_id)

    print('Waiting for Broker to fail.')
    test_utils.spin(broker_killed, broker_killed_result_checker, task_id)

    print('Waiting for Running Broker.')
    test_utils.spin(get_running_broker_task_id, found_broker, broker_name)

def test_scheduler_connection_setup_is_correct():
    def fn():
        return get_kafka_command('connection')

    def pred(result):
        return (len(result['address']) == DEFAULT_BROKER_COUNT,
                'Expected number of brokers never came online')

    connection_info = spin(fn, pred)
    assert len(connection_info) == 4
    assert len(connection_info['dns']) == DEFAULT_BROKER_COUNT
    assert connection_info['zookeeper'] == (
        'master.mesos:2181/dcos-service-{}'.format(PACKAGE_NAME))

def get_and_verify_plan(predicate=lambda r: True):
    def fn():
        return dcos.http.get(kafka_api_url('plan'))

    def success_predicate(result):
        message = 'Request to /plan failed'
        try:
            body = result.json()
        except Exception:
            return False, message
        return predicate(body), message

    return spin(fn, success_predicate).json()

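# Hypothetical usage of get_and_verify_plan: the 'status' field and the
# 'COMPLETE' value are assumptions about the /plan response shape, not
# something confirmed by the snippets in this collection.
def wait_for_plan_completion():
    return get_and_verify_plan(lambda plan: plan.get('status') == 'COMPLETE')
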
def destroy_service():
    destroy_endpoint = marathon_api_url_with_param('apps', PACKAGE_NAME)

    # Keep trying until the marathon request succeeds
    def fn():
        try:
            request(dcos.http.delete, destroy_endpoint)
            return True
        except dcos.errors.DCOSHTTPException:
            return False

    def success_predicate(success):
        return (success, 'Destroy request failed')

    spin(fn, success_predicate)

    # Make sure the scheduler has been destroyed
    def fn():
        return shakedown.get_service(PACKAGE_NAME)

    def success_predicate(service):
        return (service is None, 'Service not destroyed')

    spin(fn, success_predicate)

def get_connection_info():
    def fn():
        return shakedown.run_dcos_command('kafka connection')

    def success_predicate(result):
        deployments = dcos.http.get(marathon_api_url('deployments')).json()
        if deployments:
            return False, 'Deployment is ongoing'
        result, error = result
        try:
            result = json.loads(result)
        except Exception:
            return False, 'Command did not return JSON'
        else:
            return (
                not error and len(result['address']) == 3,
                'Command errored or expected number of brokers are not up')

    return json.loads(spin(fn, success_predicate)[0])

def tasks_updated(prefix, old_task_ids):
    def fn():
        try:
            return get_task_ids(prefix)
        except dcos.errors.DCOSHTTPException:
            return []

    def success_predicate(task_ids):
        print('Old task ids: ' + str(old_task_ids))
        print('New task ids: ' + str(task_ids))
        success = True
        for task_id in task_ids:
            print('Checking ' + task_id)
            if task_id in old_task_ids:
                success = False
        if len(task_ids) < len(old_task_ids):
            success = False
        print('Waiting for update to ' + prefix)
        return (success, 'Task type:' + prefix + ' not updated')

    return spin(fn, success_predicate)

def partition():
    for host in hosts:
        spin(shakedown.partition_agent, lambda x: (True, ''), host)
    for host in hosts:
        shakedown.reconnect_agent(host)