Example #1
def test__host_maintenance_violate_sla(stateless_job, maintenance):
    """
    1. Create a stateless job (instance_count=4) with a host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts, one
       of the instances will not get placed (hence unavailable).
    2. Start host maintenance on one of the hosts (say A).
    3. Since one instance is already unavailable, no more instances should be
       killed due to host maintenance. Verify that host A does not transition
       to DOWN.
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host1])
    assert resp

    try:
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # Timing out here is the expected outcome: the host must stay
        # DRAINING and the three placed pods must keep running.
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
        assert len(
            stateless_job.query_pods(states=[pod_pb2.POD_STATE_RUNNING])) == 3
    else:
        assert False, 'Host should not transition to DOWN'
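The try/except/else above relies on wait_for_host_state raising when the host
never reaches the target state. A minimal sketch of that polling convention,
where the generic wait_until helper and its timeouts are illustrative and not
the harness's actual implementation:

import time

def wait_until(predicate, timeout_secs=300, poll_secs=5, what="condition"):
    # Poll predicate() until it returns True; raise on timeout so callers
    # can treat "state never reached" as an expected, catchable outcome.
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        if predicate():
            return
        time.sleep(poll_secs)
    raise Exception("timed out waiting for %s" % what)

# wait_for_host_state(h, s) could then be expressed as:
#   wait_until(lambda: is_host_in_state(h, s), what="host state %s" % s)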
Example #2
def test__in_place_update_host_maintenance(stateless_job, maintenance):
    # add enough instances so each host should have some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since the in-place update takes longer to
    # process while the agent is in maintenance mode
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 9
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")
Example #3
def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    1. Create a stateless job with 3 instances.
    2. Create a job update that reduces the instance count to 2, adds a
       host-limit-1 constraint, and defines an SLA with
       maximum_unavailable_instances=1.
    3. Start host maintenance on one of the hosts.
    4. The host should transition to DOWN and the update workflow should
       SUCCEED.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 2

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=1)
    update.create()

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
Example #4
def test__host_maintenance_lifecycle(host_affinity_job, maintenance):
    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(hpb.HOST_STATE_UP)

    # Set host affinity of the job to the selected host
    host_affinity_job.job_config.defaultConfig.constraint.labelConstraint.label.value = (
        test_host)

    host_affinity_job.create()

    # Start maintenance on the selected host
    resp = maintenance["start"]([test_host])
    assert resp

    assert is_host_in_state(test_host, hpb.HOST_STATE_DRAINING)

    # Wait for host to transition to DOWN
    wait_for_host_state(test_host, hpb.HOST_STATE_DOWN)

    # Complete maintenance on the test host
    resp = maintenance["stop"]([test_host])
    assert resp

    # Host should no longer be DOWN
    assert not is_host_in_state(test_host, hpb.HOST_STATE_DOWN)

    wait_for_host_state(test_host, hpb.HOST_STATE_UP)
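The same UP -> DRAINING -> DOWN -> UP cycle generalizes to several hosts at
once. A hypothetical batch variant, built only from the calls used above
(maintenance["start"]/["stop"] and wait_for_host_state):

def cycle_hosts_through_maintenance(maintenance, hosts):
    # Drain all hosts, wait for each to fully drain to DOWN, then
    # complete maintenance and wait for each to rejoin as UP.
    assert maintenance["start"](hosts)
    for h in hosts:
        wait_for_host_state(h, hpb.HOST_STATE_DOWN)
    assert maintenance["stop"](hosts)
    for h in hosts:
        wait_for_host_state(h, hpb.HOST_STATE_UP)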
Example #5
def test__start_maintenance_kill_tasks(host_affinity_job, maintenance):
    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(hpb.HOST_STATE_UP)

    # Set host affinity of the job to the selected host
    host_affinity_job.job_config.defaultConfig.constraint.labelConstraint.label.value = (
        test_host)

    host_affinity_job.create()
    host_affinity_job.wait_for_state(goal_state="RUNNING")

    def all_running():
        return all(t.state == task.RUNNING
                   for t in host_affinity_job.get_tasks().values())

    host_affinity_job.wait_for_condition(all_running)

    resp = maintenance["start"]([test_host])
    assert resp

    def all_pending():
        return all(t.state == task.PENDING
                   for t in host_affinity_job.get_tasks().values())

    # Wait for tasks to be killed and re-queued (back to PENDING)
    host_affinity_job.wait_for_condition(all_pending)
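all_running and all_pending are the same closure with a different target
state. A small factory, hypothetical and not part of the test harness,
expresses the pattern once:

def all_tasks_in_state(job, target_state):
    # Returns a predicate suitable for job.wait_for_condition().
    def predicate():
        return all(t.state == target_state
                   for t in job.get_tasks().values())
    return predicate

# Usage:
#   host_affinity_job.wait_for_condition(
#       all_tasks_in_state(host_affinity_job, task.PENDING))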
Example #6
def test__host_draining_resumes_on_hostmgr_recovery(host_affinity_job,
                                                    maintenance, resmgr,
                                                    hostmgr):
    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(hpb.HOST_STATE_UP)

    # Set host affinity of the job to the selected host
    host_affinity_job.job_config.defaultConfig.constraint.labelConstraint.label.value = (
        test_host)

    host_affinity_job.create()
    host_affinity_job.wait_for_state(goal_state="RUNNING")

    def all_running():
        return all(t.state == task.RUNNING
                   for t in host_affinity_job.get_tasks().values())

    host_affinity_job.wait_for_condition(all_running)

    # Stop resmgr to ensure maintenance queue is not polled
    resmgr.stop()

    resp = maintenance["start"]([test_host])
    assert resp

    hostmgr.restart()
    resmgr.start()

    # Wait for host to transition to DOWN
    wait_for_host_state(test_host, hpb.HOST_STATE_DOWN)
Example #7
def test__host_draining_resumes_on_resmgr_recovery(
    host_affinity_job,
    maintenance,
    jobmgr,
    resmgr,
):
    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(hpb.HOST_STATE_UP)

    # Set host affinity of the job to the selected host
    host_affinity_job.job_config.defaultConfig.constraint.labelConstraint.label.value = (
        test_host)

    host_affinity_job.create()

    def all_running():
        return all(t.state == task.RUNNING
                   for t in host_affinity_job.get_tasks().values())

    host_affinity_job.wait_for_condition(all_running)

    resp = maintenance["start"]([test_host])
    assert resp

    # Stop jobmgr to ensure tasks are not killed
    jobmgr.stop()
    # Sleep for one draining period to ensure maintenance queue is polled
    time.sleep(draining_period_sec)
    resmgr.restart()
    jobmgr.start()

    # Wait for host to transition to DOWN
    wait_for_host_state(test_host, hpb.HOST_STATE_DOWN)
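The three host-affinity recovery tests share the same setup: pick an UP host,
pin the job to it, and wait until every task runs. A hypothetical helper
collecting those steps, using only calls that appear in the examples above:

def start_job_pinned_to_up_host(host_affinity_job):
    # Pin the job to one UP host via its label constraint, start it, and
    # block until all tasks are RUNNING; returns the chosen host.
    test_host = get_host_in_state(hpb.HOST_STATE_UP)
    constraint = host_affinity_job.job_config.defaultConfig.constraint
    constraint.labelConstraint.label.value = test_host
    host_affinity_job.create()
    host_affinity_job.wait_for_condition(
        lambda: all(t.state == task.RUNNING
                    for t in host_affinity_job.get_tasks().values()))
    return test_host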
Example #8
    def test__host_maintenance_violate_sla_restart_jobmgr(self, failure_tester, maintenance):
        """
        1. Create a stateless job (instance_count=4) with a host-limit-1
        constraint and MaximumUnavailableInstances=1. Since there are only 3
        UP hosts, one of the instances will not get placed (hence unavailable).
        2. Start host maintenance on one of the hosts (say A).
        3. Restart job manager.
        4. Since one instance is already unavailable, no more instances should be
        killed due to host maintenance. Verify that host A does not transition
        to DOWN.
        """
        stateless_job = failure_tester.stateless_job()

        job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
        json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
        stateless_job.job_spec.instance_count = 4
        stateless_job.create()
        stateless_job.wait_for_all_pods_running(num_pods=3)

        # Pick a host that is UP and start maintenance on it
        test_host1 = get_host_in_state(
            host_pb2.HOST_STATE_UP, failure_tester.client)
        # update the client in the maintenance fixture
        maintenance["update_client"](failure_tester.client)
        resp = maintenance["start"]([test_host1])
        assert resp

        leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client
        # update the client of the maintenance fixture
        maintenance["update_client"](failure_tester.client)

        try:
            wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
        except Exception:
            # Timing out here is the expected outcome: the host must stay
            # DRAINING and the three placed pods must keep running.
            assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
            assert len(stateless_job.query_pods(
                states=[pod_pb2.POD_STATE_RUNNING])) == 3
        else:
            assert False, 'Host should not transition to DOWN'
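The leader bounce plus client rewiring in the middle of this test is a
recurring failure-testing step. A hypothetical helper built from the same
calls the test already makes:

def restart_jobmgr_leader(failure_tester, stateless_job, maintenance):
    # Restart the current jobmgr leader, wait for a new leader to be
    # elected, then point the job and maintenance fixtures at the
    # refreshed client.
    leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client
    maintenance["update_client"](failure_tester.client)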