def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Partition the Marathon leader away from ZooKeeper and verify that a
    leadership change occurs once connectivity is restored."""
    previous_leader = common.get_marathon_leader_not_on_master_leader_node()
    # Drop ZooKeeper traffic (port 2181) for long enough that the ZK session
    # of the current leader expires and a re-election is forced.
    common.block_iptable_rules_for_seconds(previous_leader, 2181, sleep_seconds=30)
    common.assert_marathon_leadership_changed(previous_leader)
# NOTE(review): this is a byte-identical duplicate of the preceding
# test_marathon_zk_partition_leader_change definition; in Python the later
# definition shadows the earlier one, so only one test is actually collected.
# Confirm whether the duplicate was an accidental paste and remove it.
def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Partition the Marathon leader from ZooKeeper (port 2181) and check
    that Marathon elects a new leader."""
    leader_before = common.get_marathon_leader_not_on_master_leader_node()
    common.block_iptable_rules_for_seconds(leader_before, 2181, sleep_seconds=30)
    common.assert_marathon_leadership_changed(leader_before)
def test_marathon_delete_leader(marathon_service_name):
    """Abdicate the current Marathon leader and verify that leadership moves
    to a different node once the service is reachable again."""
    leader_before = marathon_leader_ip()
    print('leader: {}'.format(leader_before))
    common.abdicate_marathon_leader()
    # Wait for Marathon to respond on /ping before checking the new leader.
    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")
    common.assert_marathon_leadership_changed(leader_before)
# NOTE(review): identical duplicate of the preceding test_marathon_delete_leader;
# only the last same-named definition in the module is collected by pytest.
# Confirm intent and drop the redundant copy.
def test_marathon_delete_leader(marathon_service_name):
    """Force a leader abdication and assert a leadership change follows."""
    old_leader = marathon_leader_ip()
    print('leader: {}'.format(old_leader))
    common.abdicate_marathon_leader()
    # Give the cluster up to five minutes to answer on the ping endpoint.
    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")
    common.assert_marathon_leadership_changed(old_leader)
# NOTE(review): third definition of test_marathon_delete_leader in this file
# (this one uses the shakedown-qualified helpers and DELETEs v2/leader
# directly). Only the final definition is collected — verify which variant
# should survive.
def test_marathon_delete_leader(marathon_service_name):
    """DELETE v2/leader to abdicate the leader, then verify a new leader."""
    leader_before = shakedown.marathon_leader_ip()
    print('leader: {}'.format(leader_before))
    common.delete_marathon_path('v2/leader')
    # No path argument here: polls the service root rather than /ping.
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())
    common.assert_marathon_leadership_changed(leader_before)
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Block the leader's outbound connection to the Mesos master and verify
    that Marathon leadership changes as a result."""
    previous_leader = common.get_marathon_leader_not_on_master_leader_node()
    # Blocking outbound connection to the Mesos master (port 5050).
    # Marathon has a Mesos heartbeat interval of 15 seconds; if 5 heartbeats
    # are missed it disconnects, so we must wait more than 75 seconds.
    common.block_iptable_rules_for_seconds(previous_leader, 5050, sleep_seconds=100,
                                           block_input=False, block_output=True)
    common.assert_marathon_leadership_changed(previous_leader)
# NOTE(review): byte-identical duplicate of the preceding
# test_marathon_master_partition_leader_change — the earlier definition is
# shadowed. Confirm and remove one copy.
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Sever the leader's outbound Mesos-master link (port 5050) and assert
    that a re-election takes place."""
    leader_before = common.get_marathon_leader_not_on_master_leader_node()
    # Heartbeat interval is 15s and 5 misses trigger a disconnect, hence the
    # wait of 100s (> 75s) while outbound traffic is blocked.
    common.block_iptable_rules_for_seconds(leader_before, 5050, sleep_seconds=100,
                                           block_input=False, block_output=True)
    common.assert_marathon_leadership_changed(leader_before)
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the leader twice with a state backup and verify that app
    state (presence, then absence after removal) survives each re-election.

    Fix: the tasksRunning check inside check_app_existence was a bare
    comparison expression (missing ``assert``), so it never failed even when
    the instance count was wrong.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'
    # Remove stale backups from previous runs on every master.
    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            # get_app raising is only acceptable when we expect the app gone.
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                # BUG FIX: was a bare expression without `assert`, making the
                # instance-count verification a silent no-op.
                assert app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a
    # backup after an app was deleted leads to the state that marathon was not
    # able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Abdicate the leader twice and verify a running app survives the first
    re-election and stays deleted after the second.

    Fix: check_app_existence asserted the same condition twice (the first
    copy without a failure message) — the redundant bare assert is removed.
    """
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        # FIX: was duplicated — a message-less copy of this assert preceded it.
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    try:
        client.get_app(app_id)
    except Exception:
        # Expected: the app is gone, so the lookup should fail.
        pass
    else:
        assert False, "The application resurrected"

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there
    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"
# NOTE(review): second definition of test_marathon_delete_leader_and_check_apps
# in this file (this variant uses the unqualified deployment_wait /
# wait_for_service_endpoint helpers); only the later definition is collected
# by pytest — confirm which variant should remain.
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Abdicate the leader twice and verify app state across both elections.

    Fix: check_app_existence asserted tasksRunning twice (first copy had no
    failure message) — the redundant assert is removed.
    """
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        # FIX: previously duplicated (a bare, message-less copy came first).
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    deployment_wait(service_id=app_id)

    try:
        client.get_app(app_id)
    except Exception:
        # Expected: lookup of a removed app should fail.
        pass
    else:
        assert False, "The application resurrected"

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there
    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"
# NOTE(review): second definition of test_marathon_backup_and_check_apps in
# this file (this variant uses the unqualified deployment_wait /
# wait_for_service_endpoint helpers); only the later definition is collected
# by pytest — confirm which variant should remain.
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the leader twice with a state backup and verify that app
    state is preserved (then stays deleted) across each re-election.

    Fix: the tasksRunning check inside check_app_existence was a bare
    comparison expression (missing ``assert``) and could never fail.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'
    # Clean out backups left over from previous runs on every master.
    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            # A failing lookup is only acceptable when the app should be gone.
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                # BUG FIX: was a bare expression without `assert`, making the
                # instance-count verification a silent no-op.
                assert app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a
    # backup after an app was deleted leads to the state that marathon was not
    # able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)