def test__placement_exclusive_job(exclusive_host):
    """Pods carrying the exclusive-host label constraint must only ever be
    placed on the exclusive host."""
    host_label = peloton_pb2_v1alpha.Label(
        key="peloton/exclusive", value="exclusive-test-label"
    )
    constraint = pod_pb2.Constraint(
        type=1,  # Label constraint
        label_constraint=pod_pb2.LabelConstraint(
            kind=2,  # Host
            condition=2,  # Equal
            requirement=1,
            label=host_label,
        ),
    )

    # One exclusive host and two non-exclusive hosts are available. Ask for
    # a few more instances than a single exclusive host can run at once.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.default_spec.constraint.CopyFrom(constraint)
    job.job_spec.instance_count = 6

    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=4)
    job.stop()
    job.wait_for_terminated()

    # Every pod that got a host must have run on the exclusive one.
    for summary in job.list_pods():
        if summary.status.host:
            assert "exclusive" in summary.status.host
def test__revocable_tasks_move_to_revocable_queue():
    """When slack capacity is mostly consumed, a second revocable job only
    gets partial placement (1 of 3 tasks), while a non-revocable job is
    unaffected."""
    first_revocable = StatelessJob(
        job_file='test_stateless_job_revocable_spec.yaml')
    first_revocable.create()
    first_revocable.wait_for_state(goal_state='RUNNING')
    first_revocable.wait_for_all_pods_running()

    second_revocable = StatelessJob(
        job_file='test_stateless_job_revocable_slack_limit_spec.yaml')

    # Exactly 1 of the 3 tasks should be running.
    def partial_tasks_running():
        running = sum(
            1
            for idx in range(0, second_revocable.job_spec.instance_count)
            if second_revocable.get_pod(idx).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 1

    second_revocable.create()
    # sleep for 5 seconds to make sure job has enough time
    time.sleep(5)
    second_revocable.wait_for_condition(partial_tasks_running)

    non_revocable = StatelessJob(job_file='test_stateless_job_spec.yaml')
    non_revocable.create()
    non_revocable.wait_for_state('RUNNING')
    non_revocable.wait_for_all_pods_running()

    # cleanup jobs from jobmgr
    first_revocable.stop()
    second_revocable.stop()
    non_revocable.stop()
    first_revocable.wait_for_terminated()
    second_revocable.wait_for_terminated()
    non_revocable.wait_for_terminated()
def test__in_place_kill_job_release_host():
    """Killing a job that holds hosts for an in-place update must release
    those hosts so another job's update can still finish."""
    first_job = StatelessJob(
        job_file="test_stateless_job_spec.yaml",
    )
    first_job.create()
    first_job.wait_for_state(goal_state="RUNNING")

    second_job = StatelessJob(
        job_file="test_stateless_job_spec.yaml",
    )
    second_job.create()
    second_job.wait_for_state(goal_state="RUNNING")

    in_place_update = StatelessUpdate(
        first_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    in_place_update.create(in_place=True)
    # stop the update
    first_job.stop()

    regular_update = StatelessUpdate(
        second_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    regular_update.create()

    # both updates should complete
    in_place_update.wait_for_state(goal_state="SUCCEEDED")
    regular_update.wait_for_state(goal_state="SUCCEEDED")
def test__simple_revocable_batch_and_stateless_colocate():
    """Revocable and non-revocable jobs — both stateless and batch — should
    be able to run side by side on the same cluster."""
    revocable_stateless = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml")
    revocable_stateless.create()
    revocable_stateless.wait_for_state(goal_state="RUNNING")
    revocable_stateless.wait_for_all_pods_running()

    non_revocable_stateless = StatelessJob(
        job_file="test_stateless_job_spec.yaml")
    non_revocable_stateless.create()
    non_revocable_stateless.wait_for_state(goal_state="RUNNING")
    non_revocable_stateless.wait_for_all_pods_running()

    revocable_batch = Job(
        job_file="test_job_revocable.yaml",
        config=IntegrationTestConfig(pool_file='test_stateless_respool.yaml'),
    )
    revocable_batch.create()
    revocable_batch.wait_for_state(goal_state="RUNNING")

    non_revocable_batch = Job(
        job_file="test_job.yaml",
        config=IntegrationTestConfig(pool_file='test_stateless_respool.yaml'),
    )
    non_revocable_batch.create()
    non_revocable_batch.wait_for_state(goal_state="RUNNING")

    # batch jobs run to completion (default goal state)
    revocable_batch.wait_for_state()
    non_revocable_batch.wait_for_state()
def test__delete_running_job_without_force_flag():
    """Deleting a RUNNING job without the force flag must be rejected with
    gRPC ABORTED."""
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    delete_error = None
    try:
        job.delete()
    except grpc.RpcError as err:
        delete_error = err
    if delete_error is None:
        raise Exception("job in non-terminal state error not received")
    assert delete_error.code() == grpc.StatusCode.ABORTED
def test__create_update_to_unset_health_check():
    """Updating a health-checked job to a spec without a health check
    should roll out successfully."""
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    remove_health_check = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    remove_health_check.create()
    remove_health_check.wait_for_state(goal_state='SUCCEEDED')
def test__create_update_to_disable_health_check():
    """Disabling an existing health check via an update should roll out
    successfully."""
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    # Flip the liveness check off in the spec, then push it as an update.
    job.job_spec.default_spec.containers[0].liveness_check.enabled = False

    disable_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    disable_update.create()
    disable_update.wait_for_state(goal_state='SUCCEEDED')
def test__delete_running_job_with_force_flag():
    """Force-deleting a RUNNING job should remove it; a subsequent lookup
    must fail with gRPC NOT_FOUND."""
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.delete(force_delete=True)
    time.sleep(10)

    lookup_error = None
    try:
        job.get_job()
    except grpc.RpcError as err:
        lookup_error = err
    if lookup_error is None:
        raise Exception("job not found error not received")
    assert lookup_error.code() == grpc.StatusCode.NOT_FOUND
def test__health_check_detects_healthy_tasks():
    """Create a job whose health check always passes and verify that a
    HEALTH_STATE_HEALTHY pod event is emitted for the task."""
    job = StatelessJob(
        job_file='test_stateless_job_successful_health_check_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    def task_has_healthy_events():
        # `healthy` carries the HealthState enum name; the unhealthy
        # counterpart test in this file matches "HEALTH_STATE_UNHEALTHY",
        # so the healthy value is "HEALTH_STATE_HEALTHY". The previous
        # literal 'HEALTHY' could never match any event.
        for pod_event in job.get_pod(0).get_pod_events():
            if pod_event.healthy == 'HEALTH_STATE_HEALTHY':
                return True

    job.wait_for_condition(task_has_healthy_events)
def test__kill_sla_violated_job():
    """
    1. Create a stateless job(instance_count=5) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       2 of the instances will not get placed (hence unavailable).
    2. Kill job and wait for the job to reach KILLED state
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()

    # Only 3 pods can be placed; the other 2 stay unavailable.
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.stop()
    sla_job.wait_for_state(goal_state='KILLED')
def test__failed_task_throttled_by_exponential_backoff():
    """A constantly failing task must be throttled by exponential backoff,
    which bounds how many times it gets re-launched in a fixed window."""
    job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    time.sleep(40)

    # if throttle is effective, the task should not create many
    # pod events. Otherwise it can generate many pod events, during
    # the time window
    events = job.get_pod(0).get_pod_events()
    latest_pod_id = events[0].pod_id.value
    run_id = int(latest_pod_id.rsplit('-', 1)[-1])
    assert 1 < run_id < 20
def test__restart_killed_job():
    """Restarting a killed job should relaunch every pod under a new
    pod id."""
    job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    pods_before = job.query_pods()

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.restart(in_place=False)
    job.wait_for_all_pods_running()

    pods_after = job.query_pods()
    assert_pod_id_changed(pods_before, pods_after)
def test__delete_job_bad_version():
    """Deleting with a stale entity version must fail with ABORTED and the
    invalid-entity-version message."""
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.stop()
    job.wait_for_state(goal_state="KILLED")

    delete_error = None
    try:
        job.delete(entity_version="1-2-3")
    except grpc.RpcError as err:
        delete_error = err
    if delete_error is None:
        raise Exception("entity version mismatch error not received")
    assert delete_error.code() == grpc.StatusCode.ABORTED
    assert INVALID_ENTITY_VERSION_ERR_MESSAGE in delete_error.details()
def test__health_check_detects_unhealthy_tasks():
    """A job whose health check always fails should produce a
    HEALTH_STATE_UNHEALTHY pod event for its task."""
    job = StatelessJob(
        job_file="test_stateless_job_failed_health_check_spec.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    def unhealthy_event_seen():
        return any(
            event.healthy == "HEALTH_STATE_UNHEALTHY"
            for event in job.get_pod(0).get_pod_events()
        )

    job.wait_for_condition(unhealthy_event_seen)
def test_update_killed_job(in_place):
    """Updating a killed job should apply the spec change (instance count
    reduced to 3) while the job stays in KILLED state."""
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    reduce_instances = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    reduce_instances.create(in_place=in_place)
    reduce_instances.wait_for_state(goal_state="SUCCEEDED")

    assert job.get_spec().instance_count == 3
    assert job.get_status().state == stateless_pb2.JOB_STATE_KILLED
def test__delete_killed_job():
    """A killed job can be deleted without force; a subsequent lookup must
    fail with gRPC NOT_FOUND."""
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.delete()
    time.sleep(10)

    lookup_error = None
    try:
        job.get_job()
    except grpc.RpcError as err:
        lookup_error = err
    if lookup_error is None:
        raise Exception("job not found error not received")
    assert lookup_error.code() == grpc.StatusCode.NOT_FOUND
def test__stop_nonrevocable_job_to_free_resources_for_revocable_job():
    """Revocable tasks run only on slack capacity: they stay pending while
    non-revocable jobs hold the memory, and start once one of those jobs
    is stopped."""
    large_job = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    large_job.create()
    large_job.wait_for_state("RUNNING")

    preemptible_job = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    preemptible_job.create()
    preemptible_job.wait_for_state("RUNNING")

    large_job.wait_for_all_pods_running()
    preemptible_job.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job.create()

    # no tasks should be running
    def no_task_running():
        return not any(
            revocable_job.get_pod(idx).get_pod_status().state
            == pod.POD_STATE_RUNNING
            for idx in range(0, revocable_job.job_spec.instance_count)
        )

    # give job 5 seconds to run, even after that no tasks should be running
    time.sleep(5)
    revocable_job.wait_for_condition(no_task_running)

    # stop non_revocable job to free up resources for revocable job
    preemptible_job.stop()
    preemptible_job.wait_for_terminated()

    # After non_revocable job is killed, all revocable tasks should be running
    revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr
    large_job.stop()
    revocable_job.stop()
    large_job.wait_for_terminated()
    revocable_job.wait_for_terminated()
def test__failed_task_automatically_restart():
    """A task that exits with an error should be restarted automatically,
    which shows up as a new pod id for instance 0."""
    job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    old_pod_id = job.get_pod(0).get_pod_status().pod_id.value

    def job_not_running():
        # JobStatus.state is a protobuf enum (an int — see the
        # stateless_pb2.JOB_STATE_KILLED comparison elsewhere in this
        # file). Comparing it to the string 'JOB_STATE_RUNNING' was
        # always true, making this wait a no-op; compare against the
        # enum constant instead.
        return job.get_status().state != stateless_pb2.JOB_STATE_RUNNING

    job.wait_for_condition(job_not_running)

    def pod_id_changed():
        new_pod_id = job.get_pod(0).get_pod_status().pod_id.value
        return old_pod_id != new_pod_id

    job.wait_for_condition(pod_id_changed)
def test__placement_non_exclusive_job(exclusive_host):
    """Pods without the exclusive-host constraint must never be placed on
    the exclusive host."""
    # We have 1 exclusive host and 2 non-exclusive hosts. Set number of
    # instances to be a few more than what can run simulatenously
    # on 2 non-exclusive hosts
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.instance_count = 10

    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=5)
    job.stop()
    job.wait_for_terminated()

    # check that none of them ran on exclusive host
    for summary in job.list_pods():
        if summary.status.host:
            assert "exclusive" not in summary.status.host
def test_stop_running_job_with_active_update_remove_instances(in_place):
    """Stopping a job while a shrink update is rolling forward should let
    the update finish with the reduced instance count."""
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    assert len(job.query_pods()) == 5

    shrink_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    shrink_update.create(in_place=in_place)
    shrink_update.wait_for_state(goal_state="ROLLING_FORWARD")

    job.stop()

    shrink_update.wait_for_state(goal_state="SUCCEEDED")
    assert job.get_spec().instance_count == 3
def test__create_update_to_change_health_check_config(in_place):
    """Changing the health-check config through an update should succeed,
    even while the check itself is disabled."""
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.default_spec.containers[0].liveness_check.enabled = False
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Tweak one health-check parameter and push the spec as an update.
    liveness = job.job_spec.default_spec.containers[0].liveness_check
    liveness.initial_interval_secs = 2

    config_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    config_update.create(in_place=in_place)
    config_update.wait_for_state(goal_state="SUCCEEDED")
def test__create_update_to_set_health_check(in_place):
    """Adding a health check to a job via an update should roll out
    successfully."""
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_SPEC,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            pool_file='test_stateless_respool.yaml',
        ),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    add_health_check = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    add_health_check.create(in_place=in_place)
    add_health_check.wait_for_state(goal_state="SUCCEEDED")
def test__revocable_job_slack_limit():
    """With the slack limit in place, only part of the revocable job can
    run: exactly 2 of its 3 tasks should reach RUNNING."""
    slack_limited_job = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml")
    slack_limited_job.create()
    slack_limited_job.wait_for_state(goal_state="RUNNING")

    # 2 tasks are running out of 3
    def partial_tasks_running():
        running = sum(
            1
            for idx in range(0, slack_limited_job.job_spec.instance_count)
            if slack_limited_job.get_pod(idx).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 2

    slack_limited_job.wait_for_condition(partial_tasks_running)

    # cleanup job from jobmgr
    slack_limited_job.stop()
    slack_limited_job.wait_for_terminated()
def test__create_revocable_job():
    """Two revocable jobs and a non-revocable job should all reach RUNNING
    with every pod up, then terminate cleanly."""
    spec_files = (
        "test_stateless_job_revocable_spec.yaml",
        "test_stateless_job_revocable_spec.yaml",
        "test_stateless_job_cpus_large_spec.yaml",
    )

    jobs = []
    for spec_file in spec_files:
        job = StatelessJob(job_file=spec_file)
        job.create()
        job.wait_for_state(goal_state="RUNNING")
        job.wait_for_all_pods_running()
        jobs.append(job)

    # cleanup jobs from jobmgr
    for job in jobs:
        job.stop()
    for job in jobs:
        job.wait_for_terminated()
def test__preempt_revocable_job_to_run_non_revocable_job():
    """When a second non-revocable job needs the memory, the revocable job
    must be preempted until none of its tasks are running."""
    first_non_revocable = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    first_non_revocable.create()
    first_non_revocable.wait_for_state(goal_state="RUNNING")
    first_non_revocable.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job.create()
    revocable_job.wait_for_state(goal_state="RUNNING")
    revocable_job.wait_for_all_pods_running()

    # launch second non-revocable job which will pre-empt revocable job
    second_non_revocable = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    second_non_revocable.create()
    second_non_revocable.wait_for_state(goal_state="RUNNING")
    second_non_revocable.wait_for_all_pods_running()

    # no revocable job tasks should be running
    def zero_tasks_running():
        return not any(
            revocable_job.get_pod(idx).get_pod_status().state
            == pod.POD_STATE_RUNNING
            for idx in range(0, revocable_job.job_spec.instance_count)
        )

    revocable_job.wait_for_condition(zero_tasks_running)

    revocable_job.stop()
    first_non_revocable.stop()
    second_non_revocable.stop()
    revocable_job.wait_for_terminated()
    first_non_revocable.wait_for_terminated()
    second_non_revocable.wait_for_terminated()