예제 #1
0
def test__placement_exclusive_job(exclusive_host):
    """Check that a job constrained to the exclusive label runs only there.

    Args:
        exclusive_host: fixture argument; not referenced in the body
            (presumably prepares the exclusive host -- confirm against
            the fixture definition).
    """
    # Build the constraint bottom-up: label -> label constraint -> constraint.
    exclusive_label = peloton_pb2_v1alpha.Label(
        key="peloton/exclusive",
        value="exclusive-test-label",
    )
    host_label_constraint = pod_pb2.LabelConstraint(
        kind=2,  # Host
        condition=2,  # Equal
        requirement=1,
        label=exclusive_label,
    )
    excl_constraint = pod_pb2.Constraint(
        type=1,  # Label constraint
        label_constraint=host_label_constraint,
    )

    # There is 1 exclusive host and 2 non-exclusive hosts. Ask for a few
    # more instances than can run simultaneously on a single exclusive host.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.default_spec.constraint.CopyFrom(excl_constraint)
    job.job_spec.instance_count = 6
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=4)

    job.stop()
    job.wait_for_terminated()

    # Every pod that reports a host must have run on the exclusive host.
    for summary in job.list_pods():
        host = summary.status.host
        if not host:
            continue
        assert "exclusive" in host
예제 #2
0
def test__revocable_tasks_move_to_revocable_queue():
    """Run two revocable jobs and one non-revocable job side by side.

    The second revocable job (slack-limit spec) is expected to end up with
    exactly one running pod, after which a non-revocable job must still be
    able to get all of its pods running.
    """
    revocable_job1 = StatelessJob(
        job_file='test_stateless_job_revocable_spec.yaml')
    revocable_job1.create()
    revocable_job1.wait_for_state(goal_state='RUNNING')
    revocable_job1.wait_for_all_pods_running()

    revocable_job2 = StatelessJob(
        job_file='test_stateless_job_revocable_slack_limit_spec.yaml')

    # 1 task is running out of 3
    def partial_tasks_running():
        instance_count = revocable_job2.job_spec.instance_count
        running = sum(
            1
            for pod_id in range(instance_count)
            if revocable_job2.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 1

    revocable_job2.create()

    # sleep for 5 seconds to make sure job has enough time
    time.sleep(5)
    revocable_job2.wait_for_condition(partial_tasks_running)

    non_revocable_job = StatelessJob(job_file='test_stateless_job_spec.yaml')
    non_revocable_job.create()
    non_revocable_job.wait_for_state('RUNNING')
    non_revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: issue every stop first, then wait on each
    launched_jobs = (revocable_job1, revocable_job2, non_revocable_job)
    for launched in launched_jobs:
        launched.stop()
    for launched in launched_jobs:
        launched.wait_for_terminated()
예제 #3
0
def test__in_place_kill_job_release_host():
    """Stop a job right after starting an in-place update on it.

    A second job is then updated (default, non in-place call) and both
    updates are expected to reach SUCCEEDED.
    """
    def start_running_job():
        # Create a job from the shared spec and block until it is RUNNING.
        started = StatelessJob(
            job_file="test_stateless_job_spec.yaml",
        )
        started.create()
        started.wait_for_state(goal_state="RUNNING")
        return started

    job1 = start_running_job()
    job2 = start_running_job()

    update1 = StatelessUpdate(
        job1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    update1.create(in_place=True)
    # stop job1 immediately after its in-place update has been created
    job1.stop()

    update2 = StatelessUpdate(
        job2,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    update2.create()

    # both updates should complete
    for pending_update in (update1, update2):
        pending_update.wait_for_state(goal_state="SUCCEEDED")
예제 #4
0
def test__simple_revocable_batch_and_stateless_colocate():
    """Run revocable and non-revocable stateless and batch jobs together.

    Two stateless jobs are brought to RUNNING with all pods up, then two
    batch jobs are created against the stateless resource pool file and
    waited on until they reach the default goal state of
    ``Job.wait_for_state`` (not visible here; presumably a terminal
    success state -- confirm).
    """
    stateless_revocable = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml")
    stateless_revocable.create()
    stateless_revocable.wait_for_state(goal_state="RUNNING")
    stateless_revocable.wait_for_all_pods_running()

    stateless_non_revocable = StatelessJob(
        job_file="test_stateless_job_spec.yaml")
    stateless_non_revocable.create()
    stateless_non_revocable.wait_for_state(goal_state="RUNNING")
    stateless_non_revocable.wait_for_all_pods_running()

    # Batch jobs are placed in the same resource pool file as stateless jobs.
    batch_revocable = Job(
        job_file="test_job_revocable.yaml",
        config=IntegrationTestConfig(pool_file='test_stateless_respool.yaml'),
    )
    batch_revocable.create()
    batch_revocable.wait_for_state(goal_state="RUNNING")

    batch_non_revocable = Job(
        job_file="test_job.yaml",
        config=IntegrationTestConfig(pool_file='test_stateless_respool.yaml'),
    )
    batch_non_revocable.create()
    batch_non_revocable.wait_for_state(goal_state="RUNNING")

    # Wait for both batch jobs to reach the helper's default goal state.
    batch_revocable.wait_for_state()
    batch_non_revocable.wait_for_state()
예제 #5
0
def test__delete_running_job_without_force_flag():
    """Deleting a RUNNING job without the force flag must fail with ABORTED.

    Raises:
        Exception: if the delete call returns without a gRPC error.
    """
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    try:
        job.delete()
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.ABORTED
    else:
        # The delete went through, which is not what this test expects.
        raise Exception("job in non-terminal state error not received")
예제 #6
0
def test__create_update_to_unset_health_check():
    """Update a job created from the health-check spec to the plain spec.

    The update is created with at most one failed instance and one attempt
    per instance, and must reach SUCCEEDED.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=test_config,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    unset_health_check = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    unset_health_check.create()
    unset_health_check.wait_for_state(goal_state='SUCCEEDED')
예제 #7
0
def test__create_update_to_disable_health_check():
    """Disable the liveness check of a running job through an update.

    The job is created from the health-check spec; its in-memory spec is
    then modified to turn the first container's liveness check off and
    submitted as an update that must reach SUCCEEDED.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=test_config,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    # Flip the flag on the job's own spec, then push that spec as the update.
    first_container = job.job_spec.default_spec.containers[0]
    first_container.liveness_check.enabled = False

    disable_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    disable_update.create()
    disable_update.wait_for_state(goal_state='SUCCEEDED')
예제 #8
0
def test__delete_running_job_with_force_flag():
    """Force-deleting a RUNNING job must make later lookups fail NOT_FOUND.

    Raises:
        Exception: if the job can still be fetched after the delete.
    """
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.delete(force_delete=True)
    # Fixed pause before checking that the job is gone.
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.NOT_FOUND
    else:
        raise Exception("job not found error not received")
예제 #9
0
def test__health_check_detects_healthy_tasks():
    """Wait until pod 0 of a single-instance job reports a healthy event.

    The job is created from the successful-health-check spec with one
    instance; the test passes once any pod event of pod 0 carries a
    healthy state.
    """
    job = StatelessJob(
        job_file='test_stateless_job_successful_health_check_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    # NOTE(review): the sibling unhealthy test matches the prefixed name
    # "HEALTH_STATE_UNHEALTHY", so the prefixed healthy name is accepted
    # here as well as the original 'HEALTHY' -- confirm which spelling the
    # API version under test actually reports.
    healthy_states = ('HEALTHY', 'HEALTH_STATE_HEALTHY')

    def task_has_healthy_events():
        # Always return a real bool (False rather than an implicit None)
        # so the predicate's result is unambiguous to the waiter.
        return any(
            pod_event.healthy in healthy_states
            for pod_event in job.get_pod(0).get_pod_events()
        )

    job.wait_for_condition(task_has_healthy_events)
예제 #10
0
def test__kill_sla_violated_job():
    """
    Kill a job that cannot place all of its instances.

    1. Create a stateless job (instance_count=5) with a host-limit-1
       constraint and MaximumUnavailableInstances=1. With only 3 UP hosts,
       2 instances are never placed and therefore stay unavailable.
    2. Stop the job and wait for it to reach the KILLED state.
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()
    # Only 3 of the 5 instances are expected to come up.
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.stop()
    sla_job.wait_for_state(goal_state='KILLED')
예제 #11
0
def test__failed_task_throttled_by_exponential_backoff():
    """Check that a failing pod is not restarted too often in 40 seconds.

    The run id is parsed from the suffix after the last '-' of the pod id
    on the first returned pod event and must lie strictly between 1 and 20.
    """
    job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    # Observation window during which the pod keeps failing and restarting.
    time.sleep(40)

    # If throttling is effective the pod goes through only a few runs in
    # the window; without it the run id would grow much larger.
    first_event = job.get_pod(0).get_pod_events()[0]
    pod_id = first_event.pod_id.value
    suffix_start = pod_id.rindex('-') + 1
    run_id = int(pod_id[suffix_start:])
    assert 1 < run_id < 20
예제 #12
0
def test__restart_killed_job():
    """Restart a KILLED job and verify its pods come back with new ids."""
    job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    # Snapshot the pods before the kill so they can be compared afterwards.
    pods_before_restart = job.query_pods()

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.restart(in_place=False)
    job.wait_for_all_pods_running()

    pods_after_restart = job.query_pods()
    assert_pod_id_changed(pods_before_restart, pods_after_restart)
예제 #13
0
def test__delete_job_bad_version():
    """Deleting a KILLED job with a wrong entity version must be rejected.

    The expected failure is a gRPC ABORTED error whose details contain the
    invalid-entity-version message.

    Raises:
        Exception: if the delete call returns without a gRPC error.
    """
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    try:
        job.delete(entity_version="1-2-3")
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.ABORTED
        assert INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_error.details()
    else:
        raise Exception("entity version mismatch error not received")
예제 #14
0
def test__health_check_detects_unhealthy_tasks():
    """Wait until pod 0 of a single-instance job reports an unhealthy event.

    The job is created from the failed-health-check spec with one instance.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file="test_stateless_job_failed_health_check_spec.yaml",
        config=test_config,
    )
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    def task_has_unhealthy_events():
        # Returns True on the first unhealthy event; otherwise falls through
        # and returns None, which the waiter is expected to treat as "not yet".
        events = job.get_pod(0).get_pod_events()
        for event in events:
            if event.healthy == "HEALTH_STATE_UNHEALTHY":
                return True

    job.wait_for_condition(task_has_unhealthy_events)
예제 #15
0
def test_update_killed_job(in_place):
    """Update a KILLED job and verify it stays KILLED with the new spec.

    Args:
        in_place: forwarded to ``StatelessUpdate.create`` to select the
            in-place update mode.
    """
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    reduce_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    # The spec change is applied, but the job must not be brought back up.
    updated_instance_count = job.get_spec().instance_count
    assert updated_instance_count == 3
    final_state = job.get_status().state
    assert final_state == stateless_pb2.JOB_STATE_KILLED
예제 #16
0
def test__delete_killed_job():
    """Deleting a KILLED job must make later lookups fail with NOT_FOUND.

    Raises:
        Exception: if the job can still be fetched after the delete.
    """
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.delete()
    # Fixed pause before checking that the job is gone.
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.NOT_FOUND
    else:
        raise Exception("job not found error not received")
예제 #17
0
def test__stop_nonrevocable_job_to_free_resources_for_revocable_job():
    """Revocable pods start only after a non-revocable job is stopped.

    Two memory-heavy non-revocable jobs are started first. A revocable job
    created afterwards is expected to have no running pods until the second
    non-revocable job is stopped, at which point all of its pods must run.
    """
    large_job = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    large_job.create()
    large_job.wait_for_state("RUNNING")

    large_preemptible_job = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    large_preemptible_job.create()
    large_preemptible_job.wait_for_state("RUNNING")

    large_job.wait_for_all_pods_running()
    large_preemptible_job.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job.create()

    # no tasks should be running
    def no_task_running():
        pod_states = [
            revocable_job.get_pod(pod_id).get_pod_status().state
            for pod_id in range(revocable_job.job_spec.instance_count)
        ]
        return pod.POD_STATE_RUNNING not in pod_states

    # give job 5 seconds to run, even after that no tasks should be running
    time.sleep(5)
    revocable_job.wait_for_condition(no_task_running)

    # stop one non-revocable job to free up resources for the revocable job
    large_preemptible_job.stop()
    large_preemptible_job.wait_for_terminated()

    # once that job is killed, all revocable tasks should be running
    revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: issue both stops, then wait on each
    remaining_jobs = (large_job, revocable_job)
    for remaining in remaining_jobs:
        remaining.stop()
    for remaining in remaining_jobs:
        remaining.wait_for_terminated()
예제 #18
0
def test__failed_task_automatically_restart():
    """Check that a pod which exits with an error is restarted.

    The job is created from the exit-with-error spec. The test records the
    pod id of pod 0, waits for the job to leave the RUNNING state, and then
    waits for pod 0 to report a different pod id.
    """
    job = StatelessJob(job_file='test_stateless_job_exit_with_err_spec.yaml',
                       config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    old_pod_id = job.get_pod(0).get_pod_status().pod_id.value

    def job_not_running():
        # Compare against the enum constant, the same way the job state is
        # checked against stateless_pb2.JOB_STATE_KILLED elsewhere in this
        # file. Comparing to the string 'JOB_STATE_RUNNING' is always
        # unequal when the state is an enum value, so the wait was vacuous.
        return job.get_status().state != stateless_pb2.JOB_STATE_RUNNING

    job.wait_for_condition(job_not_running)

    def pod_id_changed():
        # A new pod id for instance 0 means the failed pod was relaunched.
        new_pod_id = job.get_pod(0).get_pod_status().pod_id.value
        return old_pod_id != new_pod_id

    job.wait_for_condition(pod_id_changed)
예제 #19
0
def test__placement_non_exclusive_job(exclusive_host):
    """Check that an unconstrained job never lands on the exclusive host.

    Args:
        exclusive_host: fixture argument; not referenced in the body
            (presumably prepares the exclusive host -- confirm against
            the fixture definition).
    """
    # There is 1 exclusive host and 2 non-exclusive hosts. Ask for a few
    # more instances than can run simultaneously on 2 non-exclusive hosts.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.instance_count = 10
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=5)

    job.stop()
    job.wait_for_terminated()

    # No pod that reports a host may have run on the exclusive host.
    for summary in job.list_pods():
        host = summary.status.host
        if host:
            assert "exclusive" not in host
예제 #20
0
def test_stop_running_job_with_active_update_remove_instances(in_place):
    """Stop a job while an instance-reducing update is rolling forward.

    The update must still reach SUCCEEDED and leave the job spec with
    3 instances.

    Args:
        in_place: forwarded to ``StatelessUpdate.create`` to select the
            in-place update mode.
    """
    stateless_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    initial_pods = stateless_job.query_pods()
    assert len(initial_pods) == 5

    # batch_size=1 so the update is applied one instance at a time.
    reduce_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="ROLLING_FORWARD")

    # Stop the job while the update is in the ROLLING_FORWARD state.
    stateless_job.stop()
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    final_instance_count = stateless_job.get_spec().instance_count
    assert final_instance_count == 3
예제 #21
0
def test__create_update_to_change_health_check_config(in_place):
    """Change liveness-check settings of a running job through an update.

    The job is created with the first container's liveness check disabled;
    the update then sets its initial interval to 2 seconds and must reach
    SUCCEEDED.

    Args:
        in_place: forwarded to ``StatelessUpdate.create`` to select the
            in-place update mode.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=test_config,
    )
    # Start with the liveness check turned off.
    initial_container = job.job_spec.default_spec.containers[0]
    initial_container.liveness_check.enabled = False
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Re-read the container from the spec before editing it for the update.
    updated_container = job.job_spec.default_spec.containers[0]
    updated_container.liveness_check.initial_interval_secs = 2

    config_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    config_update.create(in_place=in_place)
    config_update.wait_for_state(goal_state="SUCCEEDED")
예제 #22
0
def test__create_update_to_set_health_check(in_place):
    """Update a job created from the plain spec to the health-check spec.

    Args:
        in_place: forwarded to ``StatelessUpdate.create`` to select the
            in-place update mode.
    """
    test_config = IntegrationTestConfig(
        max_retry_attempts=100,
        pool_file='test_stateless_respool.yaml',
    )
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_SPEC, config=test_config)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Allow at most one failed instance and one attempt per instance.
    set_health_check = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    set_health_check.create(in_place=in_place)
    set_health_check.wait_for_state(goal_state="SUCCEEDED")
예제 #23
0
def test__revocable_job_slack_limit():
    """A revocable job under the slack limit runs only part of its pods.

    The job is created from the slack-limit spec and the test waits until
    exactly two of its pods are in the RUNNING state.
    """
    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml")
    revocable_job.create()
    revocable_job.wait_for_state(goal_state="RUNNING")

    # 2 tasks are running out of 3
    def partial_tasks_running():
        instance_count = revocable_job.job_spec.instance_count
        running = sum(
            1
            for pod_id in range(instance_count)
            if revocable_job.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 2

    revocable_job.wait_for_condition(partial_tasks_running)

    # cleanup job from jobmgr
    revocable_job.stop()
    revocable_job.wait_for_terminated()
예제 #24
0
def test__create_revocable_job():
    """Run two revocable jobs and a CPU-heavy non-revocable job together.

    Each job is created in turn and must reach RUNNING with all of its pods
    running before the next one is started.
    """
    def start_and_wait(job_file):
        # Create a job from the given spec and block until all pods run.
        started = StatelessJob(job_file=job_file)
        started.create()
        started.wait_for_state(goal_state="RUNNING")
        started.wait_for_all_pods_running()
        return started

    revocable_job1 = start_and_wait("test_stateless_job_revocable_spec.yaml")
    revocable_job2 = start_and_wait("test_stateless_job_revocable_spec.yaml")
    non_revocable_job = start_and_wait(
        "test_stateless_job_cpus_large_spec.yaml")

    # cleanup jobs from jobmgr: issue every stop first, then wait on each
    launched_jobs = (revocable_job1, revocable_job2, non_revocable_job)
    for launched in launched_jobs:
        launched.stop()
    for launched in launched_jobs:
        launched.wait_for_terminated()
예제 #25
0
def test__preempt_revocable_job_to_run_non_revocable_job():
    """A second non-revocable job pushes all revocable pods out.

    A preemptible memory-heavy job and a revocable job are started first.
    Once a second memory-heavy non-revocable job has all pods running, the
    revocable job is expected to have no pod left in the RUNNING state.
    """
    preemptible_large_job = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    preemptible_large_job.create()
    preemptible_large_job.wait_for_state(goal_state="RUNNING")
    preemptible_large_job.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job.create()
    revocable_job.wait_for_state(goal_state="RUNNING")
    revocable_job.wait_for_all_pods_running()

    # launch second non-revocable job which will pre-empt revocable job
    large_job = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    large_job.create()
    large_job.wait_for_state(goal_state="RUNNING")
    large_job.wait_for_all_pods_running()

    # no revocable job tasks should be running
    def zero_tasks_running():
        pod_states = [
            revocable_job.get_pod(pod_id).get_pod_status().state
            for pod_id in range(revocable_job.job_spec.instance_count)
        ]
        return pod_states.count(pod.POD_STATE_RUNNING) == 0

    revocable_job.wait_for_condition(zero_tasks_running)

    # cleanup: issue every stop first, then wait on each in the same order
    launched_jobs = (revocable_job, preemptible_large_job, large_job)
    for launched in launched_jobs:
        launched.stop()
    for launched in launched_jobs:
        launched.wait_for_terminated()