Example #1
def test__host_limit(peloton_client):
    job = Job(
        client=peloton_client,
        job_file="test_stateless_job_host_limit_1.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # All running tasks should have different hosts
    def different_hosts_for_running_tasks():
        hosts = set()
        num_running, num_pending = 0, 0
        tasks = job.list_tasks().value
        for t in tasks.values():
            if t.runtime.state == task_pb2.TaskState.Value("RUNNING"):
                num_running = num_running + 1
                hosts.add(t.runtime.host)
            if t.runtime.state == task_pb2.TaskState.Value("PENDING"):
                num_pending = num_pending + 1

        # number of running tasks should be equal to the size of the hosts set
        # there should be 1 task in PENDING
        return len(hosts) == num_running and num_pending == 1

    job.wait_for_condition(different_hosts_for_running_tasks)

    job.stop()
    job.wait_for_state(goal_state="KILLED")
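
# Note: `wait_for_condition` above presumably reduces to a bounded poll
# loop driven by the IntegrationTestConfig knobs. A minimal sketch with
# the helper name and signature assumed (not Peloton's actual code):
import time

def wait_for_condition(condition, max_retry_attempts=100, sleep_time_sec=2):
    # Poll the zero-argument predicate until it returns True, sleeping
    # between attempts; fail once the retry budget is exhausted.
    for _ in range(max_retry_attempts):
        if condition():
            return
        time.sleep(sleep_time_sec)
    raise AssertionError("condition %s not met in time" % condition.__name__)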
Example #2
def test__preemption_task_level(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_override.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100,
                                     sleep_time_sec=10),
    )

    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in running state
    def all_running():
        return all(t.state == task.RUNNING for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    # odd instance ids should be preempted
    expected_preempted_tasks = {1, 3, 5, 7, 9, 11}
    # even instance ids should be running
    expected_running_tasks = {0, 2, 4, 6, 8, 10}

    preempted_task_set, running_task_set = set(), set()

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        preempted_task_set.clear()
        running_task_set.clear()
        preempted_count, running_count = 0, 0
        for t in p_job_a.get_tasks().values():
            # tasks should be KILLED since killOnPreempt is set to true
            if t.state == task.KILLED:
                preempted_count += 1
                preempted_task_set.add(t.instance_id)
            if t.state == task.RUNNING:
                running_count += 1
                running_task_set.add(t.instance_id)

        return running_count == 6 and preempted_count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # starting the second job should change the entitlement calculation and
    # start preempting tasks from p_job_a
    p_job_b.create()

    # 6 tasks (odd instance ids) should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # verify the preempted instance ids are odd and the running ones even
    assert preempted_task_set == expected_preempted_tasks
    assert running_task_set == expected_running_tasks

    # wait for p_job_b to start running
    p_job_b.wait_for_state("RUNNING")

    kill_jobs([p_job_a, p_job_b])
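
# Note: the `task.<STATE>` names and raw integers like 8 seen in these
# tests refer to the same generated protobuf enum. A sketch, with the
# import path assumed (it is not shown in the snippets above):
from peloton.api.v0.task import task_pb2

# In the v0 TaskState enum RUNNING is 8 and SUCCEEDED is 9, which is
# why `t.state == 8` and `t.state == task.RUNNING` are interchangeable.
assert task_pb2.TaskState.Value("RUNNING") == 8
assert task_pb2.TaskState.Name(9) == "SUCCEEDED"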
Example #3
def test_update_job_increase_instances(peloton_client):
    job = Job(
        client=peloton_client,
        job_file="long_running_job.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # the job starts with 3 tasks
    expected_count = 3

    def tasks_count():
        count = 0
        for t in job.get_tasks().values():
            # 8 == RUNNING, 9 == SUCCEEDED in the v0 task state enum
            if t.state in (8, 9):
                count += 1

        print("total instances running/completed: %d" % count)
        return count == expected_count

    job.wait_for_condition(tasks_count)

    # update the job with the new config
    job.update(new_job_file="long_running_job_update_instances.yaml")

    # number of tasks should increase to 4
    expected_count = 4
    job.wait_for_condition(tasks_count)
    job.wait_for_state(goal_state="RUNNING")

    kill_jobs([job])
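
# Note: `tasks_count` can be reused for both waits because a nested
# function looks up `expected_count` in the enclosing scope at call
# time, not at definition time. A standalone illustration:
def closure_demo():
    expected = 3

    def check(value):
        # `expected` is resolved when check() runs, so rebinding it
        # below changes what the predicate accepts.
        return value == expected

    assert check(3)
    expected = 4
    assert not check(3)
    assert check(4)

closure_demo()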
Example #4
def test__preemption_spark_goalstate(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_policy.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100, sleep_time_sec=10
        ),
    )

    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in running state
    def all_running():
        return all(t.state == task.RUNNING for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    preempted_task_set = set()

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        count = 0
        for t in p_job_a.get_tasks().values():
            # tasks should be KILLED since killOnPreempt is set to true
            if t.state == task.KILLED:
                count += 1
                preempted_task_set.add(t)
        return count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # starting the second job should change the entitlement calculation
    p_job_b.create()

    # 6 tasks should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # check the state and goal state of the preempted tasks
    for t in preempted_task_set:
        assert t.state == task.KILLED
        assert t.goal_state == task.PREEMPTING

    kill_jobs([p_job_a, p_job_b])
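
# Note: the distinguishing signature of this preemption policy is the
# (state, goal_state) pair asserted above. A hedged one-line predicate
# capturing it, reusing the test's `task` enum names:
def preempted_permanently(t):
    # Under killOnPreempt the task ends KILLED with its goal state left
    # at PREEMPTING, i.e. it is not rescheduled (unlike example #5,
    # where preempted tasks go back to PENDING).
    return t.state == task.KILLED and t.goal_state == task.PREEMPTING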
Example #5
def test__preemption_tasks_reschedules_task(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=5),
    )

    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in running state
    def all_running():
        return all(t.state == task.RUNNING for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        count = 0
        for t in p_job_a.get_tasks().values():
            # tasks should be enqueued back by the jobmanager and once
            # enqueued they should transition to PENDING state
            if t.state == task.PENDING:
                count += 1
        return count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(
            max_retry_attempts=100, sleep_time_sec=10
        ),
    )
    # starting the second job should change the entitlement calculation
    p_job_b.create()

    # 6 tasks should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # p_job_b should succeed
    p_job_b.wait_for_state(goal_state="SUCCEEDED")

    kill_jobs([p_job_a, p_job_b])
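
# Note: the same "N tasks in state S" predicate recurs across these
# tests. A hedged generic version (the helper name is ours; the job
# API is as used above):
def count_in_state(job, state, expected):
    def predicate():
        # Count the job's tasks currently in `state`; suitable for
        # passing to wait_for_condition.
        tasks = job.get_tasks().values()
        return sum(1 for t in tasks if t.state == state) == expected
    return predicate

# e.g.: p_job_a.wait_for_condition(count_in_state(p_job_a, task.PENDING, 6))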
Example #6
def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    # Tests that a task which is killed while in READY state in the
    # resource manager succeeds when it is re-enqueued.

    # stop the placement engines to keep the tasks in READY state
    placement_engines.stop()

    # decorate the client to add peloton private API stubs
    c = with_private_stubs(Client())

    # create long running job with 2 instances
    long_running_job = Job(
        job_file='long_running_job.yaml',
        options=[
            with_instance_count(2),
        ],
        client=c,
    )

    long_running_job.create()
    long_running_job.wait_for_state(goal_state='PENDING')

    task = long_running_job.get_task(0)
    # wait for task to reach READY
    task.wait_for_pending_state(goal_state='READY')

    # kill the task
    task.stop()

    # re-enqueue the task
    task.start()

    # gentlemen, start your (placement) engines
    placement_engines.start()

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == 'RUNNING'

    long_running_job.wait_for_condition(wait_for_instance_to_run)
Example #7
def test__tasks_reserve_execution(hostreservepool, peloton_client):
    p_job_median = Job(
        client=peloton_client,
        job_file='test_hostreservation_job_median.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=1),
    )

    p_job_median.create()
    p_job_median.wait_for_state(goal_state='RUNNING')

    # we should have all 3 tasks in running state
    def all_running():
        return all(t.state == task.RUNNING for t in p_job_median.get_tasks().values())

    p_job_median.wait_for_condition(all_running)

    # decorate the client to add peloton private API stubs
    client = with_private_stubs(peloton_client)

    p_job_large = Job(
        job_file='test_hostreservation_job_large.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            sleep_time_sec=1,
            max_retry_attempts=300),
        options=[with_instance_count(1)],
        client=client,
    )
    p_job_large.create()
    p_job_large.wait_for_state(goal_state='PENDING')

    request = hostmgr.GetHostsByQueryRequest()

    # the task should first reach RESERVED state, then RUNNING
    t1 = p_job_large.get_task(0)
    t1.wait_for_pending_state(goal_state="RESERVED")

    # the task should end up running on the reserved host
    def get_reserved_host():
        resp = client.hostmgr_svc.GetHostsByQuery(
            request,
            metadata=p_job_large.client.hostmgr_metadata,
            timeout=p_job_large.config.rpc_timeout_sec,)

        for h in resp.hosts:
            if h.status == 'reserved':
                return h.hostname
        return ''

    def is_reserved():
        return get_reserved_host() != ''

    p_job_large.wait_for_condition(is_reserved)
    reserved_host = get_reserved_host()

    t1.wait_for_pending_state(goal_state="RUNNING")
    assert reserved_host == t1.get_info().runtime.host

    # p_job_large should succeed
    p_job_large.wait_for_state(goal_state='SUCCEEDED')

    # no host is in reserved state
    response = client.hostmgr_svc.GetHostsByQuery(
        request,
        metadata=p_job_large.client.hostmgr_metadata,
        timeout=p_job_large.config.rpc_timeout_sec,)
    for host in response.hosts:
        assert host.status != 'reserved'

    kill_jobs([p_job_median, p_job_large])
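
# Note: the reserved-host query above is issued twice with the same
# arguments. A small refactoring sketch that returns every reserved
# hostname (same RPC, metadata and fields as in the test):
def reserved_hosts(client, job):
    resp = client.hostmgr_svc.GetHostsByQuery(
        hostmgr.GetHostsByQueryRequest(),
        metadata=job.client.hostmgr_metadata,
        timeout=job.config.rpc_timeout_sec,
    )
    # Keep only hosts the host manager reports as reserved.
    return [h.hostname for h in resp.hosts if h.status == 'reserved']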
Example #8
def test__preemption_non_preemptible_task(respool_a, respool_b):
    # Create 2 jobs in respool A: one preemptible, one non-preemptible
    p_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100,
                                     sleep_time_sec=10),
    )
    p_job_a.update_instance_count(6)

    np_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(),
    )
    np_job_a.job_config.sla.preemptible = False
    np_job_a.update_instance_count(6)

    # preemptible job takes 6 CPUs
    p_job_a.create()

    # non-preemptible job takes 6 reserved CPUs
    np_job_a.create()

    p_job_a.wait_for_state("RUNNING")
    np_job_a.wait_for_state("RUNNING")

    # pool allocation is more than reservation
    assert np_job_a.pool.get_reservation("cpu") < np_job_a.pool.get_allocation(
        "cpu")

    # Create another job in respool B
    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(max_retry_attempts=100,
                                     sleep_time_sec=10),
    )
    p_job_b.update_instance_count(6)

    p_job_b.create()

    # p_job_b should remain PENDING since all cluster resources are in
    # use by p_job_a and np_job_a
    p_job_b.wait_for_state("PENDING")

    # p_job_a should be preempted and go back to PENDING
    p_job_a.wait_for_state(goal_state="PENDING")

    # np_job_a should keep RUNNING
    np_job_a.wait_for_state("RUNNING")

    def all_tasks_running():
        count = 0
        for t in p_job_b.get_tasks().values():
            if t.state == task.RUNNING:
                count += 1
        return count == 6

    # p_job_b should start running
    p_job_b.wait_for_condition(all_tasks_running)

    # pool A allocation is equal to reservation
    assert np_job_a.pool.get_reservation(
        "cpu") == np_job_a.pool.get_allocation("cpu")

    # pool B allocation is equal to reservation
    assert p_job_b.pool.get_reservation("cpu") == p_job_b.pool.get_allocation(
        "cpu")

    # wait for p_job_b to finish
    p_job_b.wait_for_state("SUCCEEDED")

    # make sure p_job_a starts running
    p_job_a.wait_for_state("RUNNING")

    kill_jobs([p_job_a, np_job_a, p_job_b])
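
# Note: the reservation-vs-allocation assertions above are this test's
# definition of "preemption has settled". A hedged helper restating
# them (pool API as used in the test):
def pool_at_reservation(pool, resource="cpu"):
    # True once the pool's allocation has converged back to its
    # reservation for the given resource.
    return pool.get_reservation(resource) == pool.get_allocation(resource)

# e.g.: assert pool_at_reservation(np_job_a.pool) and pool_at_reservation(p_job_b.pool)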