示例#1
0
def test__start_stop_task_without_job_id(peloton_client):
    """Start and stop on a Job that was never created must both report notFound.

    The Job is only constructed (``create()`` is never called), which is
    presumably what leaves it without a job id.
    """
    uncreated_job = Job(client=peloton_client)

    # Both operations carry the same expectation; start runs before stop.
    for operation in (uncreated_job.start, uncreated_job.stop):
        response = operation()
        assert response.HasField("error")
        assert response.error.HasField("notFound")
示例#2
0
def test__start_stop_task_without_job_id():
    """Start and stop on a Job that was never created must both report notFound."""

    def expect_not_found(response):
        # The response must carry an error, and that error must be notFound.
        assert response.HasField('error')
        assert response.error.HasField('notFound')

    uncreated_job = Job()
    expect_not_found(uncreated_job.start())
    expect_not_found(uncreated_job.stop())
示例#3
0
def test__start_stop_task_with_nonexistent_job_id(peloton_client):
    """Start and stop must report notFound when the job id is made up."""

    def expect_not_found(response):
        # The response must carry an error, and that error must be notFound.
        assert response.HasField("error")
        assert response.error.HasField("notFound")

    phantom_job = Job(client=peloton_client)
    # Assign an id by hand instead of calling create().
    phantom_job.job_id = "nonexistent-job-id"

    expect_not_found(phantom_job.start())
    expect_not_found(phantom_job.stop())
示例#4
0
def test__start_stop_task_with_nonexistent_job_id():
    """Start and stop must report notFound when the job id is made up."""
    phantom_job = Job()
    # Assign an id by hand instead of calling create().
    phantom_job.job_id = "nonexistent-job-id"

    # Both operations carry the same expectation; start runs before stop.
    for operation in (phantom_job.start, phantom_job.stop):
        response = operation()
        assert response.HasField('error')
        assert response.error.HasField('notFound')
示例#5
0
def test__create_a_stateless_job_with_3_tasks_on_3_different_hosts():
    """Run a 3-instance stateless job and check its tasks use 3 distinct hosts.

    The job is labelled with ``job.name`` and given a constraint built by
    ``_label_constraint`` from the same label (presumably an anti-affinity
    constraint against its own tasks -- confirm against that helper).
    """
    label_key = "job.name"
    label_value = "peloton_stateless_job"

    job = Job(
        job_file="test_stateless_job.yaml",
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            pool_file='test_stateless_respool.yaml',
        ),
        options=[
            with_labels({label_key: label_value}),
            with_constraint(_label_constraint(label_key, label_value)),
            with_instance_count(3),
        ],
    )

    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Collect the distinct hosts the tasks run on. `.values()` works on both
    # Python 2 and Python 3, unlike the Python 2-only `.iteritems()`.
    hosts = {
        job_task.get_info().runtime.host
        for job_task in job.get_tasks().values()
    }

    # Kill the job before asserting so it is cleaned up even when the
    # assertion below fails.
    kill_jobs([job])

    # Ensure that the tasks run on 3 different hosts
    assert len(hosts) == 3
示例#6
0
def test_placement_exclusive_job(exclusive_host, peloton_client):
    """Tasks constrained to the exclusive label must all run on an exclusive host.

    Each task's runtime host name is expected to contain "exclusive".
    """
    exclusive_label = peloton_pb2.Label(
        key="peloton/exclusive", value="exclusive-test-label"
    )
    host_label_constraint = task_pb2.LabelConstraint(
        kind=2,  # Host
        condition=2,  # Equal
        requirement=1,
        label=exclusive_label,
    )
    excl_constraint = task_pb2.Constraint(
        type=1,  # Label constraint
        labelConstraint=host_label_constraint,
    )

    # Ask for a few more instances than a single exclusive host can run.
    test_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2)
    job = Job(
        client=peloton_client,
        job_file="long_running_job.yaml",
        config=test_config,
        options=[with_constraint(excl_constraint), with_instance_count(6)],
    )
    job.job_config.defaultConfig.command.value = "sleep 10"
    job.create()
    job.wait_for_state()

    # Every task must have run on the exclusive host.
    for task_info in job.list_tasks().value.values():
        assert "exclusive" in task_info.runtime.host
示例#7
0
def test__run_failing_job():
    """A job built from test_job_fail.yaml must end FAILED with 4 task runs.

    Reaching SUCCEEDED instead is treated as the failure state of the wait.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(job_file='test_job_fail.yaml', config=retry_config)
    job.create()
    job.wait_for_state(goal_state='FAILED', failed_state='SUCCEEDED')

    # Argument 0 is presumably the instance id -- confirm against get_task_runs.
    task_runs = job.get_task_runs(0)
    assert len(task_runs) == 4
示例#8
0
def long_running_job(request, peloton_client):
    """Build a Job from long_running_job.yaml and stop it on teardown.

    The job is only constructed here, not created. A finalizer registered on
    ``request`` calls ``job.stop()``.
    """
    job = Job(client=peloton_client, job_file="long_running_job.yaml")

    def kill_long_running_job():
        # Teardown hook: announce, then stop the job.
        print("\nstopping long running job")
        job.stop()

    request.addfinalizer(kill_long_running_job)
    return job
示例#9
0
def test__create_job(peloton_client):
    """Create a job from the default job file and wait for its default goal state."""
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(client=peloton_client, config=retry_config)
    job.create()
    job.wait_for_state()
示例#10
0
def test__create_batch_job():
    """Create a job from test_job_no_container.yaml and wait for its default goal state."""
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(job_file="test_job_no_container.yaml", config=retry_config)
    job.create()
    job.wait_for_state()
示例#11
0
def test__run_failing_job(peloton_client):
    """A job built from test_job_fail.yaml must end FAILED with 4 task runs.

    Reaching SUCCEEDED instead is treated as the failure state of the wait.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(
        client=peloton_client,
        job_file="test_job_fail.yaml",
        config=retry_config,
    )
    job.create()
    job.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # Argument 0 is presumably the instance id -- confirm against get_task_runs.
    task_runs = job.get_task_runs(0)
    assert len(task_runs) == 4
示例#12
0
def host_affinity_job(request, peloton_client):
    """Build a Job from test_job_host_affinity_constraint.yaml and stop it on teardown.

    The job is only constructed here, not created. A finalizer registered on
    ``request`` calls ``job.stop()``.
    """
    job = Job(
        client=peloton_client,
        job_file="test_job_host_affinity_constraint.yaml",
    )

    def kill_host_affinity_job():
        # Teardown hook: announce, then stop the job.
        print("\nstopping host affinity job")
        job.stop()

    request.addfinalizer(kill_host_affinity_job)
    return job
示例#13
0
def test__create_job_without_default_config():
    """Create a job whose config is given per instance instead of as a default.

    The default config is cleared from the job config and copied into every
    instance config before the job is created.
    """
    job = Job(config=IntegrationTestConfig(max_retry_attempts=100))
    job_config = job.job_config

    # Grab the default config before clearing the field.
    # NOTE(review): relies on this reference keeping its contents after
    # ClearField -- confirm against the protobuf runtime in use.
    default_config = job_config.defaultConfig
    job_config.ClearField('defaultConfig')

    for instance_id in range(job_config.instanceCount):
        job_config.instanceConfig[instance_id].CopyFrom(default_config)

    job.create()
    job.wait_for_state()
示例#14
0
def test_large_job():
    """
    Load test against a cluster, not local minicluster friendly.

    Creates a 10000-instance job from test_job_no_container.yaml and waits
    for its default goal state.
    """
    patient_config = IntegrationTestConfig(max_retry_attempts=1000)
    job = Job(job_file='test_job_no_container.yaml', config=patient_config)

    # Override the instance count from the job file.
    job.job_config.instanceCount = 10000

    job.create()
    job.wait_for_state()
示例#15
0
def test_job_succeeds_if_controller_task_succeeds():
    """The job must reach SUCCEEDED when only its controller task succeeds.

    Per the job file's intent, the other tasks fail and the controller task
    alone should decide the job's terminal state.
    """
    controller_job = Job(job_file='test_job_succecced_controller_task.yaml')

    controller_job.create()
    controller_job.wait_for_state(goal_state='SUCCEEDED')

    # Clean up.
    kill_jobs([controller_job])
示例#16
0
def test_placement_strategy_spread():
    """With the SPREAD_JOB placement strategy, no two tasks may share a host."""
    job = Job(job_file="test_task.yaml", options=[with_instance_count(3)])
    job.job_config.placementStrategy = job_pb2.PLACEMENT_STRATEGY_SPREAD_JOB
    job.create()
    job.wait_for_state()

    # Fail on the first host that is seen twice.
    seen_hosts = set()
    for task_info in job.list_tasks().value.values():
        assert task_info.runtime.host not in seen_hosts
        seen_hosts.add(task_info.runtime.host)
示例#17
0
def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    """A task stopped while READY must run once it is started again.

    The placement engines are stopped first so the task stays in READY, then
    the task is stopped and restarted, and the engines are brought back.
    """
    # With no placement engines running, tasks stay in READY.
    placement_engines.stop()

    # Decorate the client to add the peloton private API stubs.
    client = with_private_stubs(Client())

    # Long running job with 2 instances.
    job = Job(
        job_file='long_running_job.yaml',
        options=[with_instance_count(2)],
        client=client,
    )
    job.create()
    job.wait_for_state(goal_state='PENDING')

    first_task = job.get_task(0)
    first_task.wait_for_pending_state(goal_state='READY')

    # Kill the task, then re-enqueue it.
    first_task.stop()
    first_task.start()

    # Bring the placement engines back so the task can be placed.
    placement_engines.start()

    def wait_for_instance_to_run():
        # Re-fetch instance 0 on every poll.
        return job.get_task(0).state_str == 'RUNNING'

    job.wait_for_condition(wait_for_instance_to_run)
示例#18
0
def test__create_a_batch_job_and_restart_jobmgr_completes_jobs(jobmgr):
    """A batch job must still reach its goal state when jobmgr restarts right after create."""
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(job_file='test_job_no_container.yaml', config=retry_config)
    job.create()

    # Restart immediately. That will leave some fraction of the tasks
    # unallocated and another fraction initialized.
    jobmgr.restart()

    job.wait_for_state()
    kill_jobs([job])
示例#19
0
def test_placement_non_exclusive_job(exclusive_host):
    """Tasks of an unconstrained job must never run on the exclusive host.

    No task's runtime host name may contain "exclusive".
    """
    # Ask for a few more instances than 2 (non-exclusive) hosts can run.
    test_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2)
    job = Job(
        job_file='long_running_job.yaml',
        config=test_config,
        options=[with_instance_count(12)],
    )
    job.job_config.defaultConfig.command.value = "sleep 10"
    job.create()
    job.wait_for_state()

    # None of the tasks may have run on the exclusive host.
    for task_info in job.list_tasks().value.values():
        assert "exclusive" not in task_info.runtime.host
示例#20
0
def test_placement_strategy_pack():
    """With the PACK_HOST placement strategy, all tasks must share one host."""
    job = Job(job_file="test_task.yaml", options=[with_instance_count(5)])
    job.job_config.placementStrategy = job_pb2.PLACEMENT_STRATEGY_PACK_HOST
    job.create()
    job.wait_for_state()

    # Compare each task's host with the previous task's host. The comparison
    # is skipped while the previous host is still empty.
    previous_host = ""
    for task_info in job.list_tasks().value.values():
        current_host = task_info.runtime.host
        if previous_host:
            assert current_host == previous_host
        previous_host = current_host
示例#21
0
def test_placement_strategy_pack():
    """All tasks of a 5-instance job must share one host.

    NOTE(review): the placement strategy is not set here (see the TODO
    below), so this currently checks the job's default placement.
    """
    job = Job(job_file="test_task.yaml", options=[with_instance_count(5)])
    # TODO Uncomment next line after peloton-client changes
    # job.job_config.placementStrategy = "PLACEMENT_STRATEGY_PACK_HOST"
    job.create()
    job.wait_for_state()

    # Compare each task's host with the previous task's host. The comparison
    # is skipped while the previous host is still empty.
    previous_host = ""
    for task_info in job.list_tasks().value.values():
        current_host = task_info.runtime.host
        if previous_host:
            assert current_host == previous_host
        previous_host = current_host
示例#22
0
def test_job_succeeds_if_controller_task_succeeds(peloton_client):
    """The job must reach SUCCEEDED when only its controller task succeeds.

    Per the job file's intent, the other tasks fail and the controller task
    alone should decide the job's terminal state.
    """
    controller_job = Job(
        job_file="test_job_succecced_controller_task.yaml",
        client=peloton_client,
    )

    controller_job.create()
    controller_job.wait_for_state(goal_state="SUCCEEDED")

    # Clean up.
    kill_jobs([controller_job])
示例#23
0
def test__preemption_tasks_reschedules_task(respool_a, respool_b):
    """Preempted tasks of job A must go back to PENDING when job B arrives.

    Job A runs alone in respool_a first. Creating job B in respool_b is
    expected to preempt 6 of A's tasks, and job B must then succeed.
    """
    job_a_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=5)
    p_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=job_a_config,
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    def all_running():
        # Every task of job A must be running.
        # NOTE(review): 8 is presumably the numeric value of task.RUNNING --
        # confirm.
        for t in p_job_a.get_tasks().values():
            if t.state != 8:
                return False
        return True

    p_job_a.wait_for_condition(all_running)

    def task_preempted():
        # Preempted tasks are re-enqueued by the job manager and should then
        # show up as PENDING; 6 of them (6 CPUs worth) are expected.
        pending = sum(
            1 for t in p_job_a.get_tasks().values() if t.state == task.PENDING
        )
        return pending == 6

    job_b_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10)
    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=job_b_config,
    )
    # Starting the second job should change the entitlement calculation.
    p_job_b.create()

    # 6 tasks of job A should be preempted to make room for job B.
    p_job_a.wait_for_condition(task_preempted)

    # Job B should succeed.
    p_job_b.wait_for_state(goal_state="SUCCEEDED")

    kill_jobs([p_job_a, p_job_b])
示例#24
0
def test__preemption_task_level(respool_a, respool_b):
    """Task-level preemption overrides decide which tasks of job A get killed.

    After job B is created in respool_b, the odd instance ids of job A are
    expected to be KILLED and the even ones to keep RUNNING.
    """
    job_a_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10)
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_override.yaml",
        pool=respool_a,
        config=job_a_config,
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    def all_running():
        # Every task of job A must be running.
        # NOTE(review): 8 is presumably the numeric value of task.RUNNING --
        # confirm.
        for t in p_job_a.get_tasks().values():
            if t.state != 8:
                return False
        return True

    p_job_a.wait_for_condition(all_running)

    # Odd instance ids should be preempted, even ones should keep running.
    expected_preempted_tasks = {1, 3, 5, 7, 9, 11}
    expected_running_tasks = {0, 2, 4, 6, 8, 10}

    preempted_task_set = set()
    running_task_set = set()

    def task_preempted():
        # Start from scratch on every poll so the sets only reflect the
        # latest snapshot.
        preempted_task_set.clear()
        running_task_set.clear()
        preempted_count = 0
        running_count = 0
        for t in p_job_a.get_tasks().values():
            # Tasks should be KILLED since killOnPreempt is set to true.
            if t.state == task.KILLED:
                preempted_task_set.add(t.instance_id)
                preempted_count += 1
            if t.state == task.RUNNING:
                running_task_set.add(t.instance_id)
                running_count += 1

        # 6 tasks (6 CPUs worth) preempted, the other 6 still running.
        return preempted_count == 6 and running_count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # Starting the second job should change the entitlement calculation and
    # start preempting tasks from job A.
    p_job_b.create()

    # 6 tasks (odd instance ids) of job A should be preempted to make room
    # for job B.
    p_job_a.wait_for_condition(task_preempted)

    # The preempted instance ids must be the odd ones, the running the even.
    assert preempted_task_set == expected_preempted_tasks
    assert running_task_set == expected_running_tasks

    # Wait for job B to start running.
    p_job_b.wait_for_state("RUNNING")

    kill_jobs([p_job_a, p_job_b])
示例#25
0
def test__preemption_spark_goalstate(respool_a, respool_b):
    """Preempted tasks of job A must be KILLED with goal state PREEMPTING.

    Job A runs alone in respool_a first. Creating job B in respool_b is
    expected to preempt 6 of A's tasks.
    """
    job_a_config = IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10)
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_policy.yaml",
        pool=respool_a,
        config=job_a_config,
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    def all_running():
        # Every task of job A must be running.
        # NOTE(review): 8 is presumably the numeric value of task.RUNNING --
        # confirm.
        for t in p_job_a.get_tasks().values():
            if t.state != 8:
                return False
        return True

    p_job_a.wait_for_condition(all_running)

    # Used as a set: keys are the task objects seen KILLED. It is never
    # cleared, so it accumulates across polls.
    preempted_task_set = {}

    def task_preempted():
        # Tasks should be KILLED since killOnPreempt is set to true;
        # 6 of them (6 CPUs worth) are expected.
        killed = [
            t for t in p_job_a.get_tasks().values() if t.state == task.KILLED
        ]
        for t in killed:
            preempted_task_set[t] = True
        return len(killed) == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # Starting the second job should change the entitlement calculation.
    p_job_b.create()

    # 6 tasks of job A should be preempted to make room for job B.
    p_job_a.wait_for_condition(task_preempted)

    # Check the runtime info of every task recorded as preempted.
    for t in preempted_task_set:
        assert t.state == task.KILLED
        assert t.goal_state == task.PREEMPTING

    kill_jobs([p_job_a, p_job_b])
示例#26
0
def test_non_preemptible_job(respool_a):
    """Non-preemptible jobs are admitted only within the pool's CPU reservation.

    A second non-preemptible job must stay PENDING while the first one holds
    the whole reservation, a preemptible job must still run, and the second
    non-preemptible job must reach RUNNING once the first is killed.
    """
    # First non-preemptible job: uses all of the CPU reservation.
    np_job_a_1 = Job(
        job_file="test_non_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    np_job_a_1.create()
    np_job_a_1.wait_for_state(goal_state="RUNNING")

    # The pool's CPU allocation should now equal its reservation.
    pool = np_job_a_1.pool
    assert pool.get_reservation("cpu") == pool.get_allocation("cpu")

    # Second non-preemptible job: should not be admitted, since the pool's
    # CPU reservation is used up.
    np_job_a_2 = Job(
        job_file="test_non_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=5),
    )
    np_job_a_2.create()
    np_job_a_2.wait_for_state(goal_state="PENDING")

    # A preemptible job should still start running.
    p_job_a = Job(
        job_file="test_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # Stop the first non-preemptible job.
    np_job_a_1.stop()
    np_job_a_1.wait_for_state(goal_state="KILLED")

    # The second non-preemptible job should now reach RUNNING.
    np_job_a_2.wait_for_state(goal_state="RUNNING")

    kill_jobs([np_job_a_2, p_job_a])
示例#27
0
def test__dynamic_partition_pool_restrictions(peloton_client):
    """Each job type must only be placed on hosts of its own host pool.

    Three host pools with one host each are set up. For a non-preemptible
    batch job, a stateless job and a preemptible batch job, the test checks
    how many tasks stay PENDING and that every other task runs on a host of
    the expected pool.
    """
    # We start with shared=1, batch_reserved=2. Delete batch_reserved so
    # that its hosts go to "default".
    delete_host_pool(util.HOSTPOOL_BATCH_RESERVED)

    # Set up 3 host pools with 1 host each.
    for pool_name in (
        util.HOSTPOOL_BATCH_RESERVED,
        util.HOSTPOOL_SHARED,
        util.HOSTPOOL_STATELESS,
    ):
        ensure_host_pool(pool_name, 1)

    # Map every host to the name of the pool that owns it.
    host_to_pool = {}
    for pool in list_host_pools().pools:
        for host in pool.hosts:
            host_to_pool[host] = pool.name

    def count_pending(job, expected_pool):
        # Count the PENDING tasks of `job`; every other task must be on a
        # host that belongs to `expected_pool`.
        pending = 0
        for instance in job.get_tasks():
            if job.get_task(instance).state_str == "PENDING":
                pending += 1
            else:
                hostname = job.get_task(instance).get_runtime().host
                assert host_to_pool[hostname] == expected_pool
        return pending

    # Job has two instances with 3 cpus each.
    # Only one instance will run.
    npjob = Job(
        client=peloton_client,
        job_file="test_non_preemptible_job.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    npjob.create()
    npjob.wait_for_state(goal_state='RUNNING')
    assert count_pending(npjob, util.HOSTPOOL_BATCH_RESERVED) == 1

    # Stateless job has 4 instances with host limit 1,
    # so only one instance will run.
    sjob = Job(
        client=peloton_client,
        job_file="test_stateless_job_host_limit_1.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2),
    )
    sjob.create()
    sjob.wait_for_state(goal_state="RUNNING")
    assert count_pending(sjob, util.HOSTPOOL_STATELESS) == 3

    # Preemptible batch job has 12 instances with 1 CPU each,
    # so 4 instances will run.
    pjob = Job(
        client=peloton_client,
        job_file="test_preemptible_job.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2),
    )
    pjob.create()
    pjob.wait_for_state(goal_state="RUNNING")
    assert count_pending(pjob, util.HOSTPOOL_SHARED) == 8

    # Stop all jobs.
    for job in (npjob, sjob, pjob):
        job.stop()
示例#28
0
def test_controller_task_limit_executor_can_run():
    """The controller limit must not be applied to non-controller jobs.

    1. Start controller job cjob1, which uses all of the controller limit.
    2. Start controller job cjob2 and make sure it remains pending.
    3. Start a non-controller job and make sure it succeeds.
    """

    def controller_limit_config():
        # A fresh config per job, all pointing at the same respool file.
        return IntegrationTestConfig(
            pool_file='test_respool_controller_limit.yaml')

    # cjob1 uses all of the controller limit.
    cjob1 = Job(job_file='test_controller_job.yaml',
                config=controller_limit_config())
    cjob1.create()
    cjob1.wait_for_state(goal_state='RUNNING')

    # cjob2 should remain pending because cjob1 used up the limit.
    cjob2 = Job(job_file='test_controller_job.yaml',
                config=controller_limit_config())
    cjob2.create()

    # Give cjob2 5 seconds, then make sure it still cannot run.
    time.sleep(5)
    cjob2.wait_for_state(goal_state='PENDING')

    # A normal executor job must be able to run and finish.
    executor_job = Job(job_file='test_job.yaml',
                       config=controller_limit_config())
    executor_job.create()
    executor_job.wait_for_state(goal_state='SUCCEEDED')

    kill_jobs([cjob1, cjob2])
示例#29
0
def test__create_2_stateless_jobs_with_task_to_task_anti_affinity_between_jobs(
):  # noqa
    """Two single-instance stateless jobs must end up on 2 different hosts.

    Each job is labelled ``job.name=peloton_stateless_job<i>`` and carries an
    AND constraint made of two label constraints: one on its own label and
    one on the other job's label.
    """
    label_key = "job.name"

    def job_name_constraint(label_value):
        # Label constraint on `job.name == label_value`. The numeric enum
        # values are kept exactly as in the original test.
        return task_pb2.Constraint(
            type=1,
            labelConstraint=task_pb2.LabelConstraint(
                kind=1,
                condition=2,
                requirement=0,
                label=peloton_pb2.Label(key=label_key, value=label_value),
            ),
        )

    jobs = []
    for i in range(2):
        own_label = "peloton_stateless_job%s" % i
        other_label = "peloton_stateless_job%s" % ((i + 1) % 2)

        job = Job(
            job_file="test_stateless_job.yaml",
            config=IntegrationTestConfig(
                max_retry_attempts=100,
                pool_file='test_stateless_respool.yaml',
            ),
            options=[
                with_labels({label_key: own_label}),
                with_job_name("TestPelotonDockerJob_Stateless" + repr(i)),
                with_instance_count(1),
            ],
        )
        job.job_config.defaultConfig.constraint.CopyFrom(
            task_pb2.Constraint(
                type=2,
                andConstraint=task_pb2.AndConstraint(constraints=[
                    # Tasks of my own job
                    job_name_constraint(own_label),
                    # Avoid tasks of the other job
                    job_name_constraint(other_label),
                ]),
            ))
        jobs.append(job)

    for job in jobs:
        job.create()
        time.sleep(1)

    # Collect the hosts the tasks run on. `.values()` works on both Python 2
    # and Python 3, unlike the Python 2-only `.iteritems()`.
    hosts = set()
    for job in jobs:
        job.wait_for_state(goal_state="RUNNING")
        for job_task in job.get_tasks().values():
            hosts.add(job_task.get_info().runtime.host)

    # Kill the jobs before asserting so they are cleaned up even when the
    # assertion below fails.
    kill_jobs(jobs)

    # Ensure that the tasks run on 2 different hosts
    assert len(hosts) == 2
示例#30
0
def test_controller_task_limit():
    """Once a controller task uses up the pool's controller limit, later ones wait.

    1. Start controller job1, which uses all of the controller limit.
    2. Start controller job2 and make sure it remains pending.
    3. Kill job1 and make sure job2 starts running.
    """

    def controller_limit_config():
        # A fresh config per job, all pointing at the same respool file.
        return IntegrationTestConfig(
            pool_file='test_respool_controller_limit.yaml')

    # job1 uses all of the controller limit.
    job1 = Job(job_file='test_controller_job.yaml',
               config=controller_limit_config())
    job1.create()
    job1.wait_for_state(goal_state='RUNNING')

    # job2 should remain pending because job1 used up the limit.
    job2 = Job(job_file='test_controller_job.yaml',
               config=controller_limit_config())
    job2.create()

    # Give job2 5 seconds, then make sure it still cannot run.
    time.sleep(5)
    job2.wait_for_state(goal_state='PENDING')

    # Stop job1 ...
    job1.stop()
    job1.wait_for_state(goal_state='KILLED')

    # ... which should let job2 start running.
    job2.wait_for_state(goal_state='RUNNING')

    kill_jobs([job2])