def test__revocable_tasks_move_to_revocable_queue():
    """Run two revocable jobs and one non-revocable job side by side.

    The first revocable job and the non-revocable job must get all of
    their pods running, while the slack-limited revocable job is expected
    to end up with exactly one running pod.
    """
    first_revocable = StatelessJob(
        job_file='test_stateless_job_revocable_spec.yaml')
    first_revocable.create()
    first_revocable.wait_for_state(goal_state='RUNNING')
    first_revocable.wait_for_all_pods_running()

    slack_limited = StatelessJob(
        job_file='test_stateless_job_revocable_slack_limit_spec.yaml')
    slack_limited.create()

    # 1 task is running out of 3
    def partial_tasks_running():
        # Every pod is queried on each poll; only RUNNING ones are counted.
        running = sum(
            1
            for pod_id in range(slack_limited.job_spec.instance_count)
            if slack_limited.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 1

    # sleep for 5 seconds to make sure job has enough time
    time.sleep(5)
    slack_limited.wait_for_condition(partial_tasks_running)

    non_revocable = StatelessJob(job_file='test_stateless_job_spec.yaml')
    non_revocable.create()
    non_revocable.wait_for_state('RUNNING')
    non_revocable.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: issue every stop first, then wait for each
    # job to terminate, in the same order.
    all_jobs = (first_revocable, slack_limited, non_revocable)
    for launched in all_jobs:
        launched.stop()
    for launched in all_jobs:
        launched.wait_for_terminated()
def test__health_check_detects_healthy_tasks():
    """Check that a pod with a passing health check reports a healthy event.

    A single-instance job is created from the successful-health-check spec
    and, once the job reaches RUNNING, pod 0's events are polled until one
    of them carries the healthy state.
    """
    job = StatelessJob(
        job_file='test_stateless_job_successful_health_check_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    def task_has_healthy_events():
        # Pod events report health with HEALTH_STATE_-prefixed names (the
        # unhealthy counterpart of this test matches
        # "HEALTH_STATE_UNHEALTHY"), so the bare 'HEALTHY' value could
        # never match and the wait would only end by exhausting retries.
        return any(
            pod_event.healthy == 'HEALTH_STATE_HEALTHY'
            for pod_event in job.get_pod(0).get_pod_events()
        )

    job.wait_for_condition(task_has_healthy_events)
def test__health_check_detects_unhealthy_tasks():
    """Check that a pod with a failing health check reports an unhealthy event.

    A single-instance job is created from the failed-health-check spec and,
    once the job reaches RUNNING, pod 0's events are polled until one of
    them carries the unhealthy state.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    unhealthy_job = StatelessJob(
        job_file="test_stateless_job_failed_health_check_spec.yaml",
        config=retry_config,
    )
    unhealthy_job.job_spec.instance_count = 1
    unhealthy_job.create()
    unhealthy_job.wait_for_state(goal_state="RUNNING")

    def task_has_unhealthy_events():
        events = unhealthy_job.get_pod(0).get_pod_events()
        for event in events:
            if event.healthy == "HEALTH_STATE_UNHEALTHY":
                return True
        # No matching event yet: fall through and return None (falsy).

    unhealthy_job.wait_for_condition(task_has_unhealthy_events)
def test__stop_nonrevocable_job_to_free_resources_for_revocable_job():
    """Stop a non-revocable job so that a waiting revocable job can run.

    Two memory-heavy non-revocable jobs are started first. A revocable job
    created afterwards is expected to have no running pods until the second
    non-revocable job is stopped, after which all of its pods must run.
    """
    large_job = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    large_job.create()
    large_job.wait_for_state("RUNNING")

    preemptible_large_job = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    preemptible_large_job.create()
    preemptible_large_job.wait_for_state("RUNNING")

    for running_job in (large_job, preemptible_large_job):
        running_job.wait_for_all_pods_running()

    revocable = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable.create()

    # no tasks should be running
    def no_task_running():
        # Every pod is queried on each poll; only RUNNING ones are counted.
        running = sum(
            1
            for pod_id in range(revocable.job_spec.instance_count)
            if revocable.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 0

    # give job 5 seconds to run, even after that no tasks should be running
    time.sleep(5)
    revocable.wait_for_condition(no_task_running)

    # stop non_revocable job to free up resources for revocable job
    preemptible_large_job.stop()
    preemptible_large_job.wait_for_terminated()

    # After non_revocable job is killed, all revocable tasks should be
    # running
    revocable.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: stop both first, then wait for each one.
    remaining_jobs = (large_job, revocable)
    for leftover in remaining_jobs:
        leftover.stop()
    for leftover in remaining_jobs:
        leftover.wait_for_terminated()
def test__failed_task_automatically_restart():
    """Check that pod 0 gets a new pod id after its task exits with an error.

    The pod id of instance 0 is recorded once the job reaches RUNNING. The
    test then waits for the job to leave the running state and for instance
    0 to report a pod id different from the recorded one.
    """
    failing_job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    failing_job.create()
    failing_job.wait_for_state(goal_state='RUNNING')

    initial_status = failing_job.get_pod(0).get_pod_status()
    initial_pod_id = initial_status.pod_id.value

    def job_not_running():
        # NOTE(review): the state is compared against a string literal;
        # confirm get_status().state is string-typed, since for an enum
        # value this check would always be true.
        current_state = failing_job.get_status().state
        return current_state != 'JOB_STATE_RUNNING'

    failing_job.wait_for_condition(job_not_running)

    def pod_id_changed():
        current_status = failing_job.get_pod(0).get_pod_status()
        return initial_pod_id != current_status.pod_id.value

    failing_job.wait_for_condition(pod_id_changed)
def test__revocable_job_slack_limit():
    """Check that the slack-limited revocable job runs only two of its pods.

    The job is created from the slack-limit spec and, once it reaches
    RUNNING, its pods are polled until exactly two of them are running.
    """
    slack_limited = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml")
    slack_limited.create()
    slack_limited.wait_for_state(goal_state="RUNNING")

    # 2 tasks are running out of 3
    def partial_tasks_running():
        # Every pod is queried on each poll; only RUNNING ones are counted.
        running = sum(
            1
            for pod_id in range(slack_limited.job_spec.instance_count)
            if slack_limited.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 2

    slack_limited.wait_for_condition(partial_tasks_running)

    # cleanup job from jobmgr
    slack_limited.stop()
    slack_limited.wait_for_terminated()
def test__preempt_revocable_job_to_run_non_revocable_job():
    """Check that a revocable job loses its pods to a non-revocable job.

    A non-revocable job and a revocable job are both brought to
    all-pods-running. A second non-revocable job is then launched and must
    also get all of its pods running, after which the revocable job is
    expected to have no running pods.
    """
    first_non_revocable = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    first_non_revocable.create()
    first_non_revocable.wait_for_state(goal_state="RUNNING")
    first_non_revocable.wait_for_all_pods_running()

    revocable = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable.create()
    revocable.wait_for_state(goal_state="RUNNING")
    revocable.wait_for_all_pods_running()

    # launch second non-revocable job which will pre-empt revocable job
    second_non_revocable = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    second_non_revocable.create()
    second_non_revocable.wait_for_state(goal_state="RUNNING")
    second_non_revocable.wait_for_all_pods_running()

    # no revocable job tasks should be running
    def zero_tasks_running():
        # Every pod is queried on each poll; only RUNNING ones are counted.
        running = sum(
            1
            for pod_id in range(revocable.job_spec.instance_count)
            if revocable.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 0

    revocable.wait_for_condition(zero_tasks_running)

    # Stop every job first, then wait for each to terminate, in the same
    # order.
    all_jobs = (revocable, first_non_revocable, second_non_revocable)
    for launched in all_jobs:
        launched.stop()
    for launched in all_jobs:
        launched.wait_for_terminated()