示例#1
0
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
       Verifies the task is preserved.
    """

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host,
                                           2181,
                                           sleep_seconds=10,
                                           block_input=True,
                                           block_output=False)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)

    common.kill_process_on_host(host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    client.scale_app(app_id, 2)

    time.sleep(5)
    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
示例#5
0
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_def['cpus'] = 3.5
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    client.scale_app(app_def["id"], 2)

    time.sleep(5)
    deployments = client.get_deployments()
    tasks = client.get_tasks(app_def["id"])

    # still deploying
    assert len(
        deployments
    ) == 1, "The number of deployments is {}, but 1 was expected".format(
        len(deployments))
    assert len(
        tasks) == 1, "The number of tasks is {}, but 1 was expected".format(
            len(tasks))
示例#6
0
def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
示例#7
0
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])

    common.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0][
            'id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(
            old_task_id)

    check_new_task_id()
示例#9
0
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service endpoint eventually comes back and
       the task ID stays the same.
    """

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    common.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_def["id"])
        assert len(
            tasks
        ) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(
            len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service endpoint eventually comes back and
       the task ID stays the same.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
示例#11
0
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    print('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait(app_id=app_id)
    client.scale_app(app_id, 2)

    time.sleep(5)
    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(
        deployments
    ) == 1, "The number of deployments is {}, but 1 was expected".format(
        len(deployments))
    assert len(
        tasks) == 1, "The number of tasks is {}, but 1 was expected".format(
            len(tasks))
示例#12
0
def test_marathon_when_task_agent_bounced():
    """Launch an app and restart the node the task is running on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']
    restart_agent(host)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
       Verifies the task is preserved.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))
    assert tasks[0]['host'] == host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 10, "The number of tasks is {} after scale, but 10 was expected".format(len(tasks))
    for task in tasks:
        assert task['host'] == host, "The task is on {}, but it is supposed to be on {}".format(task['host'], host)
示例#16
0
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    assert len(
        tasks
    ) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(
        len(tasks))
    assert tasks[0]['host'] == host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], host)

    client.scale_app(app_def["id"], 10)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    assert len(
        tasks
    ) == 10, "The number of tasks is {} after scale, but 10 was expected".format(
        len(tasks))
    for task in tasks:
        assert task[
            'host'] == host, "The task is on {}, but it is supposed to be on {}".format(
                task['host'], host)
示例#17
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("No such user 'bad'")), max_attempts=30))
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("No such user 'bad'")), max_attempts=30))
def test_run_app_with_non_downloadable_artifact():
    """Runs an app with a non-downloadable artifact."""

    app_def = apps.sleep_app()
    app_def['fetch'] = [{"uri": "http://localhost/missing-artifact"}]

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("Failed to fetch all URIs for container")), max_attempts=30)) # NOQA E501
示例#20
0
def test_run_app_with_non_downloadable_artifact():
    """Runs an app with a non-downloadable artifact."""

    app_def = apps.sleep_app()
    app_def['fetch'] = [{"uri": "http://localhost/missing-artifact"}]

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("Failed to fetch all URIs for container")), max_attempts=30)) # NOQA E501
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports."""

    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    common.wait_for_service_endpoint('marathon-user',
                                     timedelta(minutes=10).total_seconds(),
                                     path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'),
                                timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def simple_sleep_app(name):
    # Deploy a simple sleep app in the MoM-EE
    with shakedown.marathon_on_marathon(name=name):
        client = marathon.create_client()

        app_def = apps.sleep_app()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = shakedown.get_service_task(name, app_def["id"].lstrip("/"))
        print('MoM-EE tasks: {}'.format(tasks))
        return tasks is not None
def simple_sleep_app(mom_endpoint):
    # Deploy a simple sleep app in the MoM-EE
    with marathon_on_marathon(name=mom_endpoint) as client:
        app_def = apps.sleep_app()
        app_id = app_def["id"]

        client.add_app(app_def)
        common.deployment_wait(service_id=app_id, client=client)

        tasks = get_service_task(mom_endpoint, app_id.lstrip("/"))
        logger.info('MoM-EE tasks: {}'.format(tasks))
        return tasks is not None
def simple_sleep_app(mom_endpoint):
    # Deploy a simple sleep app in the MoM-EE
    with marathon_on_marathon(name=mom_endpoint) as client:
        app_def = apps.sleep_app()
        app_id = app_def["id"]

        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)

        tasks = get_service_task(mom_endpoint, app_id.lstrip("/"))
        logger.info('MoM-EE tasks: {}'.format(tasks))
        return tasks is not None
示例#25
0
def test_declined_offer_due_to_resource_role():
    """Tests that an offer gets declined because no resources are allocated for the role.
       In the multi role world Marathon does not accept an `acceptedResourceRole` which is not also
       the app `role` (it doesn't make sense, since the app will never start).
       In oder to use an acceptedResourceRoles: ["very-random-role"] we need to deploy the app
       in a top-level group with the same name ("very-random-role") and since enforceRole is by
       default false, we also set the role field explicitly (to the same value).
    """

    app_def = apps.sleep_app(
        app_id="/very-random-role/sleep-that-doesnt-start-because-no-resources"
    )
    app_def["role"] = "very-random-role"
    app_def["acceptedResourceRoles"] = ["very-random-role"]
    _test_declined_offer(app_def, 'UnfulfilledRole')
示例#26
0
def test_app_with_no_health_check_not_healthy():
    """Makes sure that no task is marked as healthy if no health check is defined for the corresponding app."""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])

    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 0, \
        "The number of healthy tasks is {}, but 0 was expected".format(app['tasksHealthy'])
示例#27
0
def test_app_with_no_health_check_not_healthy():
    """Makes sure that no task is marked as healthy if no health check is defined for the corresponding app."""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])

    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 0, \
        "The number of healthy tasks is {}, but 0 was expected".format(app['tasksHealthy'])
示例#28
0
def test_run_app_with_specified_user():
    """Runs an app with a given user (core). CoreOS is expected, since it has core user by default."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    task = tasks[0]
    assert task['state'] == 'TASK_RUNNING', "The task is not running: {}".format(task['state'])

    app = client.get_app(app_def["id"])
    assert app['user'] == 'core', "The app's user is not core: {}".format(app['user'])
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports."""

    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    shakedown.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
示例#30
0
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Backup and restore meeting is done with only one master since new master has to be able
       to read the backup file that was created by the previous master and the easiest way to
       test it is when there is 1 master
    """

    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app[
        'tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(
            app["tasksRunning"])
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    # Wait for new leader (but same master server) to be up and ready
    common.wait_for_service_endpoint(marathon_service_name,
                                     timedelta(minutes=5).total_seconds(),
                                     path="ping")
    app = client.get_app(app_id)
    assert app[
        'tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(
            app["tasksRunning"])
    assert task_id == app['tasks'][0][
        'id'], "Task has a different ID after restore"

    # Check if the backup file exits and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
示例#31
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message, "Launched an app with a non-existing user: {}".format(app['user'])

    check_failure_message()
示例#32
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message, "Launched an app with a non-existing user: {}".format(app['user'])

    check_failure_message()
示例#33
0
def test_pinned_task_does_not_find_unknown_host():
    """Tests that a task pinned to an unknown host will not launch.
       Within 10 secs it should still be in deployment and 0 tasks should be running.
    """

    app_def = apps.sleep_app()
    common.pin_to_host(app_def, '10.255.255.254')

    client = marathon.create_client()
    client.add_app(app_def)

    # apps deploy within secs
    # assuming that after 10 no tasks meets criteria
    time.sleep(10)

    tasks = client.get_tasks(app_def["id"])
    assert len(tasks) == 0, "The number of tasks is {}, 0 was expected".format(len(tasks))
示例#34
0
def test_run_app_with_non_downloadable_artifact():
    """Runs an app with a non-downloadable artifact."""

    app_def = apps.sleep_app()
    app_def['fetch'] = [{"uri": "http://localhost/missing-artifact"}]

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "Failed to fetch all URIs for container"
        assert error in message, "Launched an app with a non-downloadable artifact"

    check_failure_message()
示例#35
0
def test_run_app_with_specified_user():
    """Runs an app with a given user (core). CoreOS is expected, since it has core user by default."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = client.get_tasks(app_id)
    task = tasks[0]
    assert task['state'] == 'TASK_RUNNING', "The task is not running: {}".format(task['state'])

    app = client.get_app(app_def["id"])
    assert app['user'] == 'core', "The app's user is not core: {}".format(app['user'])
示例#36
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "No such user 'bad'"
        assert error in message, f"Did not receive expected error messsage \"{error}\" but \"{message}\"" # noqa E999

    check_failure_message()
示例#37
0
def test_run_app_with_non_downloadable_artifact():
    """Runs an app with a non-downloadable artifact."""

    app_def = apps.sleep_app()
    app_def['fetch'] = [{"uri": "http://localhost/missing-artifact"}]

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "Failed to fetch all URIs for container"
        assert error in message, "Launched an app with a non-downloadable artifact"

    check_failure_message()
示例#38
0
def test_run_app_with_specified_user():
    """Runs an app with a given user (cnetos). CentOS is expected, since it has centos user by default."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    task = tasks[0]
    assert task['state'] == 'TASK_RUNNING', "The task is not running: {}".format(task['state'])

    app = client.get_app(app_id)
    assert app['user'] == 'centos', "The app's user is not centos: {}".format(app['user'])
示例#39
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_failure_message():
        app = client.get_app(app_def["id"])
        message = app['lastTaskFailure']['message']
        error = "No such user 'bad'"
        assert error in message, f"Did not receive expected error messsage \"{error}\" but \"{message}\"" # noqa E999

    check_failure_message()
def test_pinned_task_does_not_find_unknown_host():
    """Tests that a task pinned to an unknown host will not launch.
       Within 10 secs it should still be in deployment and 0 tasks should be running.
    """

    app_def = apps.sleep_app()
    common.pin_to_host(app_def, '10.255.255.254')

    client = marathon.create_client()
    client.add_app(app_def)

    # apps deploy within secs
    # assuming that after 10 no tasks meets criteria
    time.sleep(10)

    tasks = client.get_tasks(app_def["id"])
    assert len(tasks) == 0, "The number of tasks is {}, 0 was expected".format(len(tasks))
def test_run_app_with_specified_user():
    """Runs an app with a given user (cnetos). CentOS is expected, since it has centos user by default."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    task = tasks[0]
    assert task['state'] == 'TASK_RUNNING', "The task is not running: {}".format(task['state'])

    app = client.get_app(app_id)
    assert app['user'] == 'centos', "The app's user is not centos: {}".format(app['user'])
示例#42
0
def test_default_user():
    """Ensures a task is started as root by default."""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_def["id"])
    host = tasks[0]['host']

    success = shakedown.run_command_on_agent(host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert success, "The app is running as non-root"
示例#43
0
def test_default_user():
    """Ensures a task is started as root by default."""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_def["id"])
    host = tasks[0]['host']

    success = shakedown.run_command_on_agent(host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert success, "The app is running as non-root"
示例#44
0
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    common.kill_process_on_host(host, '[s]leep 1000')

    assert_that(lambda: client.get_tasks(app_id)[0],
                eventually(has_value('id', not_(equal_to(old_task_id))), max_attempts=30))
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    common.kill_process_on_host(host, '[s]leep 1000')

    assert_that(lambda: client.get_tasks(app_id)[0],
                eventually(has_value('id', not_(equal_to(old_task_id))), max_attempts=30))
def test_mom_with_network_failure():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports."""

    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
    wait_for_task("marathon-user", app_id.lstrip('/'))

    with marathon_on_marathon() as client:
        wait_for_task("marathon-user", app_id.lstrip('/'))

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
示例#47
0
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Backup and restore meeting is done with only one master since new master has to be able
       to read the backup file that was created by the previous master and the easiest way to
       test it is when there is 1 master
    """

    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    # Wait for new leader (but same master server) to be up and ready
    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")
    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    assert task_id == app['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exits and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
示例#48
0
def test_marathon_when_task_agent_bounced():
    """Launch an app and restart the node the task is running on."""

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        deployment_wait(service_id=app_id, client=client)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
示例#50
0
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_def['cpus'] = 3.5
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    client.scale_app(app_def["id"], 2)

    time.sleep(5)
    deployments = client.get_deployments()
    tasks = client.get_tasks(app_def["id"])

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
示例#51
0
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    common.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(old_task_id)

    check_new_task_id()
def test_declined_offer_due_to_resource_role():
    """Tests that an offer gets declined because the role doesn't exist."""

    app_def = apps.sleep_app()
    app_def["acceptedResourceRoles"] = ["very_random_role"]
    _test_declined_offer(app_def, 'UnfulfilledRole')
示例#53
0
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    deployment_wait(service_id=app_id)

    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there
    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"
示例#54
0
def test_marathon_backup_and_check_apps(marathon_service_name):

    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_command_health_check_healthy():
    """Tests COMMAND health check"""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    assert_app_healthy(client, app_def, common.command_health_check())
示例#56
0
def test_declined_offer_due_to_cpu_requirements():
    """Tests that an offer gets declined because the number of CPUs can't be found in an offer."""

    app_def = apps.sleep_app()
    app_def["cpus"] = 12345
    _test_declined_offer(app_def, 'InsufficientCpus')
示例#57
0
def test_marathon_backup_and_check_apps(marathon_service_name):

    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    for master_ip in shakedown.get_all_master_ips():
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_declined_offer_due_to_cpu_requirements():
    """Tests that an offer gets declined because the number of CPUs can't be found in an offer."""

    app_def = apps.sleep_app()
    app_def["cpus"] = 12345
    _test_declined_offer(app_def, 'InsufficientCpus')
示例#59
0
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        if original_leader == current_leader:
            common.delete_marathon_path('v2/leader')
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    shakedown.deployment_wait()

    try:
        _ = client.get_app(app_id)
    except:
        pass
    else:
        assert False, "The application resurrected"

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still not there
    try:
        _ = client.get_app(app_id)
    except:
        pass
    else:
        assert False, "The application resurrected"
示例#60
0
def test_command_health_check_healthy():
    """Tests COMMAND health check"""

    app_def = apps.sleep_app()
    client = marathon.create_client()
    assert_app_healthy(client, app_def, common.command_health_check())