def test_config_update_then_scheduler_died():
    """Kill the scheduler right after a config change.

    Records the current 'world' task ids and the scheduler host, bumps the
    world CPU setting, kills the scheduler process on that host, and then
    checks that the 'world' tasks were updated and the service is running.
    """
    # Snapshot state before the change so the update can be detected later.
    ids_before_update = tasks.get_task_ids(PACKAGE_NAME, 'world')
    scheduler_host = marathon.get_scheduler_host(PACKAGE_NAME)

    bump_world_cpus()
    tasks.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before_update)
    check_running()
def test_config_update_then_scheduler_died():
    """Kill the scheduler immediately after bumping the 'world' CPU config.

    The 'world' task ids captured before the change are later passed to
    ``sdk_tasks.check_tasks_updated`` to confirm the update was applied even
    though the scheduler process was killed in between.
    """
    previous_world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    scheduler_agent = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)

    # Trigger the config update, then take the scheduler down mid-rollout.
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern(
        'helloworld.scheduler.Main',
        scheduler_agent,
    )

    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME,
        'world',
        previous_world_ids,
    )
    config.check_running()
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill the scheduler process and check that no service task was updated.

    All task ids of the foldered service (empty task-name prefix) are captured
    before the scheduler process is killed and compared afterwards through
    ``sdk_tasks.check_tasks_not_updated``.
    """
    service_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    task_ids_before_kill = sdk_tasks.get_task_ids(service_name, '')

    scheduler_host = sdk_marathon.get_scheduler_host(service_name)
    shakedown.kill_process_on_host(scheduler_host, "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(service_name, '', task_ids_before_kill)
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill the scheduler process and check the "master" tasks keep their ids."""
    master_ids_before_kill = sdk_tasks.get_task_ids(
        FOLDERED_SERVICE_NAME, "master"
    )

    # Resolve the host first, then kill the scheduler process running on it.
    scheduler_host = sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(scheduler_host, "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(
        FOLDERED_SERVICE_NAME, "master", master_ids_before_kill
    )
def test_kill_scheduler():
    """Kill the scheduler task and check that Marathon replaced it.

    Exactly one scheduler task is expected under the "marathon" service
    before the kill; afterwards its id must have changed and the service must
    pass ``check_healthy``.
    """
    task_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    ids_before_kill = sdk_tasks.get_task_ids("marathon", task_prefix)
    assert len(ids_before_kill) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=scheduler_host,
    )

    sdk_tasks.check_tasks_updated("marathon", task_prefix, ids_before_kill)
    check_healthy()
def test_kill_scheduler():
    """Kill the HDFS scheduler and check that only the scheduler task changed.

    Service task ids and the single scheduler task id are captured first.
    After the kill, the scheduler id under "marathon" must differ while the
    service task ids must be unchanged.
    """
    service_ids_before = sdk_tasks.get_task_ids(foldered_name, "")
    task_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids_before = sdk_tasks.get_task_ids("marathon", task_prefix)
    assert len(scheduler_ids_before) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(foldered_name)
    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs",
        "nobody",
        agent_host=scheduler_host,
    )

    # Expect a new scheduler task, with the service's own tasks untouched.
    sdk_tasks.check_tasks_updated("marathon", task_prefix, scheduler_ids_before)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", service_ids_before)
    config.check_healthy(service_name=foldered_name)
def test_kill_scheduler():
    """Kill the scheduler task and check the service tasks are left alone.

    Steps: snapshot the service and scheduler task ids, kill the scheduler
    process, check the scheduler task id changed, call
    ``sdk_tasks.wait_for_active_framework``, then check the service is running
    and its task ids are unchanged.
    """
    service_ids_before = sdk_tasks.get_task_ids(config.SERVICE_NAME, "")
    task_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    scheduler_ids_before = sdk_tasks.get_task_ids("marathon", task_prefix)
    assert (
        len(scheduler_ids_before) == 1
    ), "Expected to find ONLY one scheduler task but found {}".format(scheduler_ids_before)

    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=scheduler_host,
    )

    sdk_tasks.check_tasks_updated("marathon", task_prefix, scheduler_ids_before)
    sdk_tasks.wait_for_active_framework(config.SERVICE_NAME)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "", service_ids_before)
def test_kill_scheduler():
    """Kill the HDFS scheduler and check that only the scheduler task changed.

    After the kill, the scheduler task id must differ,
    ``sdk_tasks.wait_for_active_framework`` is called for the service, and the
    service task ids captured up front must be unchanged.
    """
    service_ids_before = sdk_tasks.get_task_ids(foldered_name, "")
    task_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids_before = sdk_tasks.get_task_ids("marathon", task_prefix)
    assert len(scheduler_ids_before) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(foldered_name)
    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs",
        "nobody",
        agent_host=scheduler_host,
    )

    # Expect a new scheduler task, with the service's own tasks untouched.
    sdk_tasks.check_tasks_updated("marathon", task_prefix, scheduler_ids_before)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", service_ids_before)
    config.check_healthy(service_name=foldered_name)
def test_dispatcher_placement(configure_universe):
    """Install Spark with a hostname constraint and check where it landed.

    One private agent's hostname is used in a ``hostname CLUSTER <host>``
    constraint passed as a service option. The host reported by
    ``sdk_marathon.get_scheduler_host`` must equal that hostname. Spark is
    torn down in all cases.
    """
    service_name = "spark"
    target_hostname = sdk_agents.get_private_agents().pop()["hostname"]
    constraint = ["hostname", "CLUSTER", target_hostname]

    log.info(
        "Running test: service_name='{}', constraints=[[{}]]".format(
            service_name, ','.join(constraint)
        )
    )

    options = {
        "service": {
            "name": service_name,
            "constraints": [constraint],
        }
    }

    try:
        utils.require_spark(service_name=service_name, additional_options=options)

        dispatcher_host = sdk_marathon.get_scheduler_host(service_name)
        log.info("Dispatcher Host: {}".format(dispatcher_host))
        assert target_hostname == dispatcher_host
    finally:
        utils.teardown_spark(service_name=service_name)
def test_kill_scheduler():
    """Kill the scheduler task; only the scheduler task id may change.

    The service task ids and the single scheduler task id are recorded, the
    scheduler process is killed, and the test then checks that the scheduler
    id changed, calls ``sdk_tasks.wait_for_active_framework``, and checks the
    service is running with its task ids unchanged.
    """
    original_task_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "")
    prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    original_scheduler_ids = sdk_tasks.get_task_ids("marathon", prefix)

    scheduler_count = len(original_scheduler_ids)
    assert scheduler_count == 1, (
        "Expected to find ONLY one scheduler task but found {}".format(
            original_scheduler_ids
        )
    )

    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(config.SERVICE_NAME),
    )

    sdk_tasks.check_tasks_updated("marathon", prefix, original_scheduler_ids)
    sdk_tasks.wait_for_active_framework(config.SERVICE_NAME)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "", original_task_ids)
def test_scheduler_died():
    """Kill the scheduler process on its host, then run ``check_running``."""
    scheduler_host = marathon.get_scheduler_host(PACKAGE_NAME)
    tasks.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)
    check_running()
def test_scheduler_died():
    """Kill the scheduler process on its host, then run ``config.check_running``."""
    scheduler_host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)
    config.check_running()
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill the scheduler process; the "master" task ids must not change."""
    task_type = "master"
    ids_before_kill = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, task_type)

    scheduler_host = sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(scheduler_host, "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(
        FOLDERED_SERVICE_NAME,
        task_type,
        ids_before_kill,
    )
def test_scheduler_died():
    """Kill the scheduler process by pattern and check the service is running."""
    # Look up where the scheduler currently runs before killing it there.
    host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    sdk_cmd.kill_task_with_pattern(
        'helloworld.scheduler.Main',
        host,
    )
    config.check_running()
def test_marathon_volume_collission():
    """Check that the service does not unreserve Marathon persistent volumes.

    This test validates that a service registered in a sub-role of
    slave_public will _not_ unreserve Marathon volumes RESERVED in the
    `slave_public` role.

    Flow: uninstall Hello World, install a Marathon app that writes a marker
    file into a persistent volume, then verify the marker is still readable
    after scaling the app to zero, installing Hello World, uninstalling it,
    and scaling the app back up. Hello World is reinstalled and the Marathon
    app destroyed on the way out.
    """
    # Uninstall HW first
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    # Install the marathon app
    marathon_app_name = "persistent-test"
    volume_name = "persistent-volume"
    persistent_app = {
        "id": marathon_app_name,
        "mem": 128,
        "user": "******",
        "cmd": "echo 'this is a test' > {}/test && sleep 10000".format(volume_name),
        "container": {
            "type": "MESOS",
            "volumes": [
                {
                    "persistent": {
                        "type": "root",
                        "size": 500,
                        "constraints": []
                    },
                    "mode": "RW",
                    "containerPath": volume_name
                }
            ]
        }
    }

    def scale_marathon_app(instances):
        # Re-read the config each time so the update is applied to current state.
        app_config = sdk_marathon.get_config(marathon_app_name)
        app_config['instances'] = instances
        sdk_marathon.update_app(marathon_app_name, app_config)

    try:
        sdk_marathon.install_app(persistent_app)

        # Get its persistent volume. List only this app's volume rather than the
        # whole role directory, so other volumes on the agent cannot produce a
        # multi-line result that breaks the `cat` below.
        # Expected shape: <role dir>/persistent-test#persistent-volume#<uuid>
        host = sdk_marathon.get_scheduler_host(marathon_app_name)
        ok, pv_path = sdk_cmd.agent_ssh(
            host,
            "ls -d /var/lib/mesos/slave/volumes/roles/slave_public/{}#{}#*".format(
                marathon_app_name, volume_name
            ),
        )
        assert ok
        pv_path = pv_path.strip()

        @retrying.retry(wait_fixed=1000, stop_max_delay=60*1000)
        def check_content():
            ok, pv_content = sdk_cmd.agent_ssh(host, "cat {}/test".format(pv_path))
            # A failed command must not be mistaken for a content mismatch.
            assert ok and pv_content.strip() == "this is a test"

        check_content()

        # Scale down the Marathon app
        scale_marathon_app(0)

        # Install Hello World
        sdk_install.install(config.PACKAGE_NAME,
                            config.SERVICE_NAME,
                            config.DEFAULT_TASK_COUNT,
                            additional_options=pre_reserved_options)

        # Make sure the persistent volume is still there
        check_content()

        # Uninstall Hello World
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

        # Make sure the persistent volume is still there
        check_content()

        # Scale back up the marathon app
        scale_marathon_app(1)

        # Make sure the persistent volume is still there
        check_content()
    finally:
        try:
            # Reinstall hello world
            sdk_install.install(config.PACKAGE_NAME,
                                config.SERVICE_NAME,
                                config.DEFAULT_TASK_COUNT,
                                additional_options=pre_reserved_options)
        finally:
            # Always remove the Marathon app, even if the reinstall failed.
            sdk_marathon.destroy_app(marathon_app_name)
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill the scheduler process; the "master" task ids must stay the same."""
    master_ids_before = tasks.get_task_ids(PACKAGE_NAME, "master")

    scheduler_host = marathon.get_scheduler_host(PACKAGE_NAME)
    shakedown.kill_process_on_host(scheduler_host, "elastic.scheduler.Main")

    tasks.check_tasks_not_updated(PACKAGE_NAME, "master", master_ids_before)
def test_marathon_volume_collision():
    """Check that the service does not unreserve Marathon persistent volumes.

    This test validates that a service registered in a sub-role of
    slave_public will _not_ unreserve Marathon volumes RESERVED in the
    `slave_public` role.

    Flow: uninstall Hello World, install a Marathon app that writes a marker
    file into a persistent volume, then verify the marker is still readable
    after scaling the app to zero, installing Hello World, uninstalling it,
    and scaling the app back up. Hello World is reinstalled and the Marathon
    app destroyed on the way out.
    """
    # Uninstall HW first
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    # Install the marathon app
    marathon_app_name = "persistent-test"
    volume_name = "persistent-volume"
    persistent_app = {
        "id": marathon_app_name,
        "mem": 128,
        "user": "******",
        "cmd": "echo 'this is a test' > {}/test && sleep 10000".format(volume_name),
        "container": {
            "type": "MESOS",
            "volumes": [{
                "persistent": {
                    "type": "root",
                    "size": 500,
                    "constraints": []
                },
                "mode": "RW",
                "containerPath": volume_name,
            }],
        },
    }

    def scale_marathon_app(instances):
        # Re-read the config each time so the update is applied to current state.
        app_config = sdk_marathon.get_config(marathon_app_name)
        app_config["instances"] = instances
        sdk_marathon.update_app(app_config)

    try:
        sdk_marathon.install_app(persistent_app)

        # Get its persistent Volume
        host = sdk_marathon.get_scheduler_host(marathon_app_name)
        # Should get e.g.: "/var/lib/mesos/slave/volumes/roles/slave_public/persistent-test#persistent-volume#76e7bb6d-64fa-11e8-abc5-8e679b292d5e"
        rc, pv_path, _ = sdk_cmd.agent_ssh(
            host,
            "ls -d /var/lib/mesos/slave/volumes/roles/slave_public/{}#{}#*".format(
                marathon_app_name, volume_name
            ),
        )

        if rc != 0:
            log.error("Could not get slave_public roles. return-code: '%s'\n", rc)
        assert rc == 0

        pv_path = pv_path.strip()

        @retrying.retry(wait_fixed=1000, stop_max_delay=60 * 1000)
        def check_content():
            rc, pv_content, _ = sdk_cmd.agent_ssh(
                host, "cat {}/test".format(pv_path))
            assert rc == 0 and pv_content.strip() == "this is a test"

        check_content()

        # Scale down the Marathon app
        scale_marathon_app(0)

        # Install Hello World
        sdk_install.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            PRERESERVED_TASK_COUNT,
            additional_options=pre_reserved_options,
        )

        # Make sure the persistent volume is still there
        check_content()

        # Uninstall Hello World
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

        # Make sure the persistent volume is still there
        check_content()

        # Scale back up the marathon app
        scale_marathon_app(1)

        # Make sure the persistent volume is still there
        check_content()
    finally:
        try:
            # Reinstall hello world
            sdk_install.install(
                config.PACKAGE_NAME,
                config.SERVICE_NAME,
                PRERESERVED_TASK_COUNT,
                additional_options=pre_reserved_options,
            )
        finally:
            # Always remove the Marathon app, even if the reinstall failed.
            sdk_marathon.destroy_app(marathon_app_name)
def test_backup_and_restore_to_s3_compatible_storage() -> None:
    """Run the S3 backup/restore plans against a minio endpoint.

    Installs minio and marathon-lb, points the AWS credential environment
    variables at the minio credentials from ``config``, creates the bucket with
    the ``aws`` CLI, and runs ``config.run_backup_and_restore`` with the
    "backup-s3" / "restore-s3" plans. Both packages are uninstalled and the
    original AWS credential environment is restored afterwards.

    Raises:
        AssertionError: if ``AWS_ACCESS_KEY_ID`` is not set in the environment.
    """
    # Capture the caller's credentials before the try block so the cleanup in
    # `finally` never references an unbound name when an early step fails.
    temp_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    temp_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Check the precondition before installing anything.
    assert (
        temp_key_id
    ), 'AWS credentials are required for this test. Disable test with e.g. TEST_TYPES="sanity and not aws"'

    try:
        sdk_install.install(
            "minio",
            "minio",
            expected_running_tasks=0,
            package_version="0.0.13-RELEASE.2018-10-06T00-15-16Z",
            wait_for_deployment=False,
        )

        # NOTE(review): the non-strict default is an empty string although the
        # strict-mode value is a dict; confirm sdk_install.install accepts it.
        options = ""
        if sdk_utils.is_strict_mode():
            sdk_security.create_service_account(
                service_account_name="marathon-lb-sa",
                service_account_secret="marathon-lb/service-account-secret",
            )
            sdk_cmd.run_cli(
                "security org users grant marathon-lb-sa dcos:service:marathon:marathon:services:/ read"
            )
            sdk_cmd.run_cli(
                'security org users grant marathon-lb-sa dcos:service:marathon:marathon:admin:events read --description "Allows access to Marathon events"'
            )
            options = {
                "marathon-lb": {
                    "secret_name": "marathon-lb/service-account-secret",
                    "marathon-uri": "https://marathon.mesos:8443",
                }
            }

        sdk_install.install(
            "marathon-lb",
            "marathon-lb",
            expected_running_tasks=0,
            additional_options=options,
            package_version="1.14.0",
            wait_for_deployment=False,
        )

        # The minio endpoint is addressed through the node running marathon-lb.
        host = sdk_marathon.get_scheduler_host("marathon-lb")
        _, public_node_ip, _ = sdk_cmd.agent_ssh(host, "curl -s ifconfig.co")
        minio_endpoint_url = "http://" + public_node_ip + ":9000"

        # From here on the environment carries the minio credentials.
        os.environ["AWS_ACCESS_KEY_ID"] = config.MINIO_AWS_ACCESS_KEY_ID
        os.environ["AWS_SECRET_ACCESS_KEY"] = config.MINIO_AWS_SECRET_ACCESS_KEY

        # NOTE(review): the exit status of bucket creation is not checked.
        subprocess.run(
            [
                "aws",
                "s3",
                "mb",
                "s3://" + config.MINIO_BUCKET_NAME,
                "--endpoint",
                minio_endpoint_url,
            ]
        )

        plan_parameters = {
            "AWS_ACCESS_KEY_ID": os.getenv("AWS_ACCESS_KEY_ID"),
            "AWS_SECRET_ACCESS_KEY": os.getenv("AWS_SECRET_ACCESS_KEY"),
            "AWS_REGION": os.getenv("AWS_REGION", "us-west-2"),
            "S3_BUCKET_NAME": config.MINIO_BUCKET_NAME,
            "SNAPSHOT_NAME": str(uuid.uuid1()),
            "CASSANDRA_KEYSPACES": '"testspace1 testspace2"',
            "S3_ENDPOINT_URL": minio_endpoint_url,
        }

        config.run_backup_and_restore(
            config.get_foldered_service_name(),
            "backup-s3",
            "restore-s3",
            plan_parameters,
            config.get_foldered_node_address(),
        )
    finally:
        try:
            sdk_install.uninstall("minio", "minio")
            sdk_install.uninstall("marathon-lb", "marathon-lb")
        finally:
            # Restore the credentials even if an uninstall failed. A variable
            # that was originally unset is removed again, because assigning
            # None to os.environ raises TypeError.
            saved_credentials = (
                ("AWS_ACCESS_KEY_ID", temp_key_id),
                ("AWS_SECRET_ACCESS_KEY", temp_secret_access_key),
            )
            for env_name, original_value in saved_credentials:
                if original_value is None:
                    os.environ.pop(env_name, None)
                else:
                    os.environ[env_name] = original_value
def test_marathon_volume_collision():
    """Check that the service does not unreserve Marathon persistent volumes.

    This test validates that a service registered in a sub-role of
    slave_public will _not_ unreserve Marathon volumes RESERVED in the
    `slave_public` role.

    Flow: uninstall Hello World, install a Marathon app that writes a marker
    file into a persistent volume, then verify the marker is still readable
    after scaling the app to zero, installing Hello World, uninstalling it,
    and scaling the app back up. Hello World is reinstalled and the Marathon
    app destroyed on the way out.
    """
    # Uninstall HW first
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    # Install the marathon app
    marathon_app_name = "persistent-test"
    volume_name = "persistent-volume"
    persistent_app = {
        "id": marathon_app_name,
        "mem": 128,
        "user": "******",
        "cmd": "echo 'this is a test' > {}/test && sleep 10000".format(volume_name),
        "container": {
            "type": "MESOS",
            "volumes": [
                {
                    "persistent": {"type": "root", "size": 500, "constraints": []},
                    "mode": "RW",
                    "containerPath": volume_name,
                }
            ],
        },
    }

    def scale_marathon_app(instances):
        # Re-read the config each time so the update is applied to current state.
        app_config = sdk_marathon.get_config(marathon_app_name)
        app_config["instances"] = instances
        sdk_marathon.update_app(app_config)

    try:
        sdk_marathon.install_app(persistent_app)

        # Get its persistent Volume
        host = sdk_marathon.get_scheduler_host(marathon_app_name)
        # Should get e.g.: "/var/lib/mesos/slave/volumes/roles/slave_public/persistent-test#persistent-volume#76e7bb6d-64fa-11e8-abc5-8e679b292d5e"
        rc, pv_path, _ = sdk_cmd.agent_ssh(
            host,
            "ls -d /var/lib/mesos/slave/volumes/roles/slave_public/{}#{}#*".format(
                marathon_app_name, volume_name
            ),
        )

        if rc != 0:
            log.error("Could not get slave_public roles. return-code: '%s'\n", rc)
        assert rc == 0

        pv_path = pv_path.strip()

        @retrying.retry(wait_fixed=1000, stop_max_delay=60 * 1000)
        def check_content():
            rc, pv_content, _ = sdk_cmd.agent_ssh(host, "cat {}/test".format(pv_path))
            assert rc == 0 and pv_content.strip() == "this is a test"

        check_content()

        # Scale down the Marathon app
        scale_marathon_app(0)

        # Install Hello World
        sdk_install.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            config.DEFAULT_TASK_COUNT,
            additional_options=pre_reserved_options,
        )

        # Make sure the persistent volume is still there
        check_content()

        # Uninstall Hello World
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

        # Make sure the persistent volume is still there
        check_content()

        # Scale back up the marathon app
        scale_marathon_app(1)

        # Make sure the persistent volume is still there
        check_content()
    finally:
        try:
            # Reinstall hello world
            sdk_install.install(
                config.PACKAGE_NAME,
                config.SERVICE_NAME,
                config.DEFAULT_TASK_COUNT,
                additional_options=pre_reserved_options,
            )
        finally:
            # Always remove the Marathon app, even if the reinstall failed.
            sdk_marathon.destroy_app(marathon_app_name)
def test_marathon_volume_collission():
    """Check that the service does not unreserve Marathon persistent volumes.

    This test validates that a service registered in a sub-role of
    slave_public will _not_ unreserve Marathon volumes RESERVED in the
    `slave_public` role.

    Flow: uninstall Hello World, install a Marathon app that writes a marker
    file into a persistent volume, then verify the marker is still readable
    after scaling the app to zero, installing Hello World, uninstalling it,
    and scaling the app back up. Hello World is reinstalled and the Marathon
    app destroyed on the way out.
    """
    # Uninstall HW first
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    # Install the marathon app
    marathon_app_name = "persistent-test"
    volume_name = "persistent-volume"
    persistent_app = {
        "id": marathon_app_name,
        "mem": 128,
        "user": "******",
        "cmd": "echo 'this is a test' > {}/test && sleep 10000".format(volume_name),
        "container": {
            "type": "MESOS",
            "volumes": [{
                "persistent": {
                    "type": "root",
                    "size": 500,
                    "constraints": []
                },
                "mode": "RW",
                "containerPath": volume_name
            }]
        }
    }

    def scale_marathon_app(instances):
        # Re-read the config each time so the update is applied to current state.
        app_config = sdk_marathon.get_config(marathon_app_name)
        app_config['instances'] = instances
        sdk_marathon.update_app(marathon_app_name, app_config)

    try:
        sdk_marathon.install_app(persistent_app)

        # Get its persistent volume. List only this app's volume rather than the
        # whole role directory, so other volumes on the agent cannot produce a
        # multi-line result that breaks the `cat` below.
        # Expected shape: <role dir>/persistent-test#persistent-volume#<uuid>
        host = sdk_marathon.get_scheduler_host(marathon_app_name)
        ok, pv_path = shakedown.run_command_on_agent(
            host,
            "ls -d /var/lib/mesos/slave/volumes/roles/slave_public/{}#{}#*".format(
                marathon_app_name, volume_name
            ),
        )
        assert ok
        pv_path = pv_path.strip()

        @retrying.retry(wait_fixed=1000, stop_max_delay=60 * 1000)
        def check_content():
            ok, pv_content = shakedown.run_command_on_agent(
                host, "cat {}/test".format(pv_path))
            # A failed command must not be mistaken for a content mismatch.
            assert ok and pv_content.strip() == "this is a test"

        check_content()

        # Scale down the Marathon app
        scale_marathon_app(0)

        # Install Hello World
        sdk_install.install(config.PACKAGE_NAME,
                            config.SERVICE_NAME,
                            config.DEFAULT_TASK_COUNT,
                            additional_options=pre_reserved_options)

        # Make sure the persistent volume is still there
        check_content()

        # Uninstall Hello World
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

        # Make sure the persistent volume is still there
        check_content()

        # Scale back up the marathon app
        scale_marathon_app(1)

        # Make sure the persistent volume is still there
        check_content()
    finally:
        try:
            # Reinstall hello world
            sdk_install.install(config.PACKAGE_NAME,
                                config.SERVICE_NAME,
                                config.DEFAULT_TASK_COUNT,
                                additional_options=pre_reserved_options)
        finally:
            # Always remove the Marathon app, even if the reinstall failed.
            sdk_marathon.destroy_app(marathon_app_name)