예제 #1
0
    def test__restart_restart_jobmgr(self, failure_tester, in_place):
        """Restart the job-manager leader while a job restart is in flight.

        Creates a stateless job, triggers a restart of it, restarts the
        job-manager leader, waits for a leader change, and then checks that
        the restart workflow succeeds and the pod ids differ from before.
        """
        stateless_job = failure_tester.stateless_job()
        stateless_job.create()
        stateless_job.wait_for_state(goal_state="RUNNING")

        # Snapshot taken before the restart, used for the final comparison.
        pods_before = stateless_job.query_pods()
        stateless_job.restart(in_place=in_place)

        framework = failure_tester.fw
        jobmgr = failure_tester.jobmgr

        leader = framework.get_leader_info(jobmgr)
        assert leader
        restart_result = framework.restart(jobmgr, "leader")
        assert restart_result != 0
        failure_tester.wait_for_leader_change(jobmgr, leader)

        # Rebuild the client after the leader change and hand it to the job.
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
        stateless_job.wait_for_all_pods_running()

        pods_after = stateless_job.query_pods()
        assert_pod_id_changed(pods_before, pods_after)
예제 #2
0
def test__create_update_stopped_job(stateless_job, in_place):
    """Update a stopped job, start it again, and verify the resulting pods.

    The job is created and brought to RUNNING, then stopped and brought to
    KILLED.  An update that also adds instances is created against the
    stopped job, the job is started, and the test waits for the update to
    reach SUCCEEDED and the job to reach RUNNING.

    Checks performed:
      * the job had 3 pods before the update and has 5 afterwards;
      * pod ids changed and the spec of instance 0 changed;
      * every pod reported after the update is in POD_STATE_RUNNING.

    Args:
        stateless_job: job fixture under test.
        in_place: forwarded to ``update.create`` as ``in_place``.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    stateless_job.start()
    update.wait_for_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_state(goal_state="RUNNING")
    new_pod_infos = stateless_job.query_pods()
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()
    assert len(old_pod_infos) == 3
    assert len(new_pod_infos) == 5
    assert_pod_id_changed(old_pod_infos, new_pod_infos)
    assert_pod_spec_changed(old_instance_zero_spec, new_instance_zero_spec)

    # The job is started again after the update is created, so both the
    # pre-existing and the newly added instances are expected to be RUNNING.
    # NOTE(review): the previous version branched on whether the pod *name*
    # was contained in the list of pod-info objects; a string is not expected
    # to compare equal to a pod-info object, so the KILLED branch was dead
    # and every pod was effectively asserted RUNNING.  That effective check
    # is kept here, stated directly.
    for pod_info in new_pod_infos:
        assert pod_info.status.state == pod_pb2.POD_STATE_RUNNING
예제 #3
0
    def test__create_update_restart_jobmgr(self, failure_tester, in_place):
        """Restart the job-manager leader while a job update is in flight.

        Creates a stateless job, starts an update that also adds instances,
        restarts the job-manager leader, waits for a leader change, and then
        checks that the update succeeds, the pod count goes from 3 to 5, and
        both the pod ids and the spec of instance 0 have changed.
        """
        stateless_job = failure_tester.stateless_job()
        stateless_job.create()
        stateless_job.wait_for_state(goal_state="RUNNING")

        # State captured before the update, used for the final comparisons.
        pods_before = stateless_job.query_pods()
        first_pod = stateless_job.get_pod(0)
        spec_before = first_pod.get_pod_spec()

        update = failure_tester.stateless_update(
            stateless_job,
            updated_job_file="test_update_stateless_job_update_and_add_instances_spec.yaml",
            batch_size=1,
        )
        update.create(in_place=in_place)

        framework = failure_tester.fw
        jobmgr = failure_tester.jobmgr

        leader = framework.get_leader_info(jobmgr)
        assert leader
        restart_result = framework.restart(jobmgr, "leader")
        assert restart_result != 0
        failure_tester.wait_for_leader_change(jobmgr, leader)

        # Rebuild the client after the leader change and hand it to the job.
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        update.wait_for_state(goal_state="SUCCEEDED")
        pods_after = stateless_job.query_pods()
        first_pod = stateless_job.get_pod(0)
        spec_after = first_pod.get_pod_spec()

        assert len(pods_before) == 3
        assert len(pods_after) == 5
        assert_pod_id_changed(pods_before, pods_after)
        assert_pod_spec_changed(spec_before, spec_after)
예제 #4
0
def test__restart_restart_jobmgr(stateless_job, jobmgr, in_place):
    """Restart the job manager right after triggering a job restart.

    Verifies that the restart workflow still reaches SUCCEEDED, all pods
    come back running, and the pod ids differ from the ones seen before.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # Snapshot taken before the restart is requested.
    pods_before = stateless_job.query_pods()

    stateless_job.restart(in_place=in_place)
    jobmgr.restart()

    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #5
0
def test__restart_pods(canary_job):
    """Restart the canary job and check that its pod ids change."""
    pods_before = canary_job.query_pods()

    # TODO add back batch size after API update in peloton client
    # (the intended call is restart(batch_size=1)).
    canary_job.restart()

    canary_job.wait_for_workflow_state(goal_state='SUCCEEDED')
    canary_job.wait_for_all_pods_running()

    # A restart is expected to kill and relaunch the already running
    # instances, which shows up as changed pod ids.
    pods_after = canary_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #6
0
def test__restart_running_job(stateless_job, in_place):
    """Restart a running job and check that its pod ids change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    stateless_job.restart(in_place=in_place)

    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()

    # A restart is expected to kill and relaunch the already running
    # instances, which shows up as changed pod ids.
    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #7
0
def test__start_killed_job(stateless_job):
    """Stop a running job, start it again, and check the pod ids change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # Snapshot of the pods from the first run.
    pods_before = stateless_job.query_pods()

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    stateless_job.start()
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #8
0
def test__restart_restart_jobmgr(stateless_job, jobmgr):
    """Restart the job manager right after triggering a job restart.

    Verifies that the restart workflow still reaches SUCCEEDED, all pods
    come back running, and the pod ids differ from the ones seen before.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    # Snapshot taken before the restart is requested.
    pods_before = stateless_job.query_pods()

    stateless_job.restart()
    jobmgr.restart()

    stateless_job.wait_for_workflow_state(goal_state='SUCCEEDED')
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #9
0
def test__create_update(stateless_job, in_place):
    """Apply an update to a running job and check pods and spec change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #10
0
def test__restart_running_job_with_batch_size(stateless_job):
    """Restart a running job with batch_size=1 and check pod ids change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    pods_before = stateless_job.query_pods()

    stateless_job.restart(batch_size=1)

    stateless_job.wait_for_workflow_state(goal_state='SUCCEEDED')
    stateless_job.wait_for_all_pods_running()

    # A restart is expected to kill and relaunch the already running
    # instances, which shows up as changed pod ids.
    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #11
0
def test__restart_killed_job_with_batch_size(stateless_job):
    """Restart a killed job with batch_size=1 and check pod ids change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state='KILLED')

    # The baseline is taken after the job has been killed.
    pods_before = stateless_job.query_pods()

    stateless_job.restart(batch_size=1)
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #12
0
def test__restart_killed_job():
    """Restart a killed job (not in place) and check pod ids change.

    The job is built directly from ``test_stateless_job_spec_k8s.yaml``
    instead of coming from a fixture.
    """
    job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Snapshot of the pods from the first run.
    pods_before = job.query_pods()

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.restart(in_place=False)
    job.wait_for_all_pods_running()

    pods_after = job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #13
0
def test__restart_killed_job_with_batch_size(stateless_job, in_place):
    """Restart a killed job and check that its pod ids change.

    Despite the name, no batch size is passed to ``restart`` here; see the
    TODO below.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    # The baseline is taken after the job has been killed.
    pods_before = stateless_job.query_pods()

    # TODO add back batch size after API update in peloton client
    # (the intended call is restart(batch_size=1)).
    stateless_job.restart(in_place=in_place)
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
예제 #14
0
def test__create_update_with_batch_size(stateless_job):
    """Apply an update with batch_size=1 and check pods and spec change."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=1,
    )
    update.create()
    update.wait_for_state(goal_state='SUCCEEDED')

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #15
0
def test__create_update_update_and_add_instances(stateless_job):
    """Apply an update that also adds instances and verify the result.

    Expects 3 pods before the update and 5 afterwards, with changed pod ids
    and a changed spec for instance 0.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
    )
    update.create()
    update.wait_for_state(goal_state='SUCCEEDED')

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #16
0
def test__pause_resume_initialized_update(stateless_job, in_place):
    """Pause an update immediately after creating it, then resume it.

    Verifies the update reaches PAUSED, then SUCCEEDED after the resume,
    and that pod ids and the spec of instance 0 have changed.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    update.create(in_place=in_place)

    # Pause right away, so the update may still be INITIALIZED at this point.
    update.pause()
    update.wait_for_state(goal_state="PAUSED")

    update.resume()
    update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #17
0
def test__create_update_update_and_add_instances_with_batch(stateless_job, in_place):
    """Apply a batched update that also adds instances and verify the result.

    Expects 3 pods before the update and 5 afterwards, with changed pod ids
    and a changed spec for instance 0.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #18
0
def test__pause_resume__update(stateless_job, in_place):
    """Pause an update shortly after it starts, then resume it.

    Verifies the update reaches PAUSED, then SUCCEEDED after the resume,
    and that pod ids and the spec of instance 0 have changed.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    update.create(in_place=in_place)

    # Wait one second so the update can begin to roll forward before pausing.
    time.sleep(1)
    update.pause()
    update.wait_for_state(goal_state="PAUSED")

    update.resume()
    update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #19
0
def test__create_update_update_start_paused(stateless_job):
    """Create an update with start_paused=True, resume it, and verify.

    Expects the update to reach PAUSED without an explicit pause call, then
    SUCCEEDED after the resume, with 3 pods before and 5 after, changed pod
    ids, and a changed spec for instance 0.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before the update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        start_paused=True,
    )
    update.create()
    update.wait_for_state(goal_state="PAUSED")

    update.resume()
    update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
예제 #20
0
def test__create_multiple_consecutive_updates(stateless_job):
    """Create two updates back to back and verify the second one wins.

    The first update is created but never waited on; the second is created
    immediately afterwards and is the one expected to reach SUCCEEDED.
    Expects 3 pods before and 5 after, changed pod ids, and a changed spec
    for instance 0.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # State captured before either update.
    pods_before = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_before = first_pod.get_pod_spec()

    first_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    first_update.create()

    second_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    second_update.create()
    second_update.wait_for_state(goal_state="SUCCEEDED")

    # State captured after the second update has succeeded.
    pods_after = stateless_job.query_pods()
    first_pod = stateless_job.get_pod(0)
    spec_after = first_pod.get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)