예제 #1
0
def test__host_maintenance_within_sla_limit(stateless_job, maintenance):
    """
    Verify that host maintenance drains pods without violating the job SLA.

    1. Create a stateless job(instance_count=4) and MaximumUnavailableInstances=1.
       Wait for all pods to reach RUNNING state. The test assumes this leaves
       at least one host with more than one instance.
    2. Start host maintenance on the host (say A) running the most pods of
       the job.
    3. Pods on the host A should get killed in a way (1 at a time)
       that doesn't violate the SLA and host A should transition to DOWN

    :param stateless_job: job fixture; its spec is mutated and the job created
    :param maintenance: fixture exposing a "start" callable taking hostnames
    """
    stateless_job.job_spec.instance_count = 4
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Sort descending by pod count so the first entry is the busiest host;
    # an ascending sort could select a host with no pods of this job at all.
    sorted_hosts = [
        t[0]
        for t in sorted(host_to_task_count.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
    ]

    # Pick a host that has pods running on it and start maintenance on it.
    test_host = sorted_hosts[0]

    resp = maintenance["start"]([test_host])
    assert resp

    # Wait for host to transition to DOWN
    max_retry_attempts = 20
    # NOTE(review): the original comment derives this bound as
    # instance_count - maximum_unavailable_instances, which would be 3 here;
    # the original hard-coded threshold of 2 is kept -- confirm the intent.
    min_running_pods = 2
    target_state_name = host_pb2.HostState.Name(host_pb2.HOST_STATE_DOWN)

    log.info("%s waiting for state %s", test_host, target_state_name)
    for _ in range(max_retry_attempts):
        try:
            if is_host_in_state(test_host, host_pb2.HOST_STATE_DOWN):
                break
            running_pods = stateless_job.query_pods(
                states=[pod_pb2.POD_STATE_RUNNING])
        except Exception as e:
            # Failures of the queries themselves are logged and the poll is
            # retried on the next iteration.
            log.warn(e)
        else:
            # Checked outside the try so the AssertionError is not swallowed
            # by the retry handler above.
            assert len(running_pods) >= min_running_pods, (
                "SLA violated: only %d pods RUNNING" % len(running_pods)
            )
        time.sleep(5)
    else:
        # Reached only when the loop never hit `break`, i.e. the host was not
        # seen in DOWN state within max_retry_attempts polls.
        log.info(
            "%s max attempts reached to wait for host state %s",
            test_host,
            target_state_name,
        )
        raise AssertionError(
            "%s did not reach state %s" % (test_host, target_state_name)
        )
예제 #2
0
def test__update_with_host_maintenance_and_agent_down(stateless_job,
                                                      maintenance):
    """
    Verify host maintenance does not complete while an agent is down and an
    update is in flight.

    1. Create a large stateless job (intended to take up more than two-thirds
       of the cluster resources) with MaximumUnavailableInstances=2.
    2. Take down a host which has pods running on it. Per the original notes,
       this will cause TASK_LOST to be sent for all pods on the host after
       75 seconds.
    3. Start host maintenance on another host (say A) having pods of the job.
       MaximumUnavailableInstances=2 ensures that not more than 2 pods are
       unavailable due to host maintenance at a time.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on the
       host A should not be killed once LOST event is received. Verify that
       host A does not transition to DOWN.

    :param stateless_job: job fixture; its spec is mutated and the job created
    :param maintenance: fixture exposing a "start" callable taking hostnames
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Hosts ordered from most to fewest pods of this job.
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # pick another host which has pods of the job to take down
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        # Add a container to the spec entry keyed 10 so the update has
        # something to change.
        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # Any failure in the sequence above (presumably including the wait
        # for DOWN timing out -- confirm against wait_for_host_state) is
        # accepted as long as the host is still DRAINING. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit abort the test run.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        # The whole sequence succeeded, so the host was reported DOWN. Raised
        # outside the try so this failure is not swallowed by the handler.
        raise AssertionError('Host should not transition to DOWN')
    finally:
        # Always bring the stopped host container back.
        host_container.start()
예제 #3
0
def get_host_in_state(state):
    """
    Look up one host that is currently in the given state.

    The caller is responsible for making sure at least one host is in the
    requested state; otherwise the assertion below fails.

    :param state: host_pb2.HostState
    :return: Hostname of the first host returned by the query
    """
    matching_hosts = query_hosts([state]).host_infos
    assert len(matching_hosts) > 0
    first_host = matching_hosts[0]
    return first_host.hostname
예제 #4
0
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    Verify host maintenance stalls when a failed update already exhausts the
    job's unavailability budget.

    1. Create a stateless job with 6 instances. Wait for all instances to reach
       RUNNING state. The test assumes this leaves at least one host with 2 or
       more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and auto-rollback
       disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that the host A doesn't transition to DOWN.

    :param stateless_job: job fixture; its spec is mutated and the job created
    :param maintenance: fixture exposing a "start" callable taking hostnames
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Hosts ordered from most to fewest pods of this job.
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Build the bad spec from the test config, keeping the instance count and
    # SLA identical to the running job.
    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # Expected path: the wait did not succeed (presumably a timeout --
        # confirm against wait_for_host_state), so the host must still be
        # DRAINING. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit abort the test run.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        # The wait returned normally, so the host was reported DOWN. Raised
        # outside the try so this failure is not swallowed by the handler.
        raise AssertionError('Host should not transition to DOWN')
예제 #5
0
def test__host_maintenance_no_sla_defined_for_job(stateless_job, maintenance):
    """
    Host maintenance on every host completes when the job defines no SLA.

    1. Create the stateless job from the fixture's spec as-is, with no SLA
       set by this test (the original notes describe it as instance_count=3).
    2. Start host maintenance on every host returned by the host query
       (3 hosts per the original notes).
    3. The pods should get killed and all the hosts should transition to DOWN
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    host_infos = query_hosts([]).host_infos
    hostnames = [info.hostname for info in host_infos]
    maintenance["start"](hostnames)

    # Wait on each host in turn, in the order the query returned them.
    for info in host_infos:
        wait_for_host_state(info.hostname, host_pb2.HOST_STATE_DOWN)