示例#1
0
def test__restart_killed_job():
    job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = job.query_pods()

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.restart(in_place=False)

    job.wait_for_all_pods_running()

    new_pod_infos = job.query_pods()
    assert_pod_id_changed(old_pod_infos, new_pod_infos)
示例#2
0
def test__in_place_update_success_rate():
    stateless_job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config("test_stateless_job_spec_k8s.yaml")
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 30
    updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, pod_id in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s", pod_name, old_pod_dict[pod_name],
                     new_pod_dict[pod_name])
            count = count + 1
    assert count == 0
示例#3
0
def test_stop_running_job_with_active_update_remove_instances(in_place):
    stateless_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    assert len(stateless_job.query_pods()) == 5

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLING_FORWARD")

    stateless_job.stop()
    update.wait_for_state(goal_state="SUCCEEDED")
    assert stateless_job.get_spec().instance_count == 3