def test__delete_running_job_with_force_flag():
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.delete(force_delete=True)
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.NOT_FOUND
        return
    raise Exception("job not found error not received")
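# Several delete tests below repeat the same try/except/raise pattern to
# assert that a gRPC call fails with an expected status code. A minimal
# sketch of a helper that captures the pattern (the helper name is
# illustrative, not from the source; it relies on the grpc import already
# used by these tests):
def assert_get_job_fails_with(job, code):
    # expect job.get_job() to raise an RpcError carrying the given code
    try:
        job.get_job()
    except grpc.RpcError as e:
        assert e.code() == code
        return
    raise Exception("expected gRPC error %s not received" % code)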
def test__kill_sla_violated_job():
    """
    1. Create a stateless job (instance_count=5) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       2 of the instances will not get placed (hence unavailable).
    2. Kill the job and wait for it to reach KILLED state.
    """
    job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    job.job_spec.instance_count = 5
    job.create()
    job.wait_for_all_pods_running(num_pods=3)

    job.stop()
    job.wait_for_state(goal_state='KILLED')
def stateless_job(request, peloton_client):
    job = StatelessJob(client=peloton_client)
    if util.minicluster_type() == "k8s":
        job = StatelessJob(
            job_file="test_stateless_job_spec_k8s.yaml",
            client=peloton_client,
        )

    # teardown
    def kill_stateless_job():
        print("\nstopping stateless job")
        job.stop()

    request.addfinalizer(kill_stateless_job)

    return job
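# A minimal usage sketch for the fixture above (assumption: in the real
# module the function is registered with @pytest.fixture; pytest then
# injects it by parameter name and runs the finalizer after the test, so
# the test body only has to describe the desired state):
def test_stateless_job_reaches_running(stateless_job):
    # the fixture returns an un-created job handle; the test drives it
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")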
def wait_for_deletion(client, timeout_secs):
    """
    Wait for job deletion to complete.
    """
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        try:
            jobs = [
                StatelessJob(job_id=s.job_id.value, client=client)
                for s in list_jobs()
            ]
            if len(jobs) == 0:
                return
            time.sleep(2)
        except grpc.RpcError as e:
            # Catch "not-found" error here because the QueryJobs endpoint
            # does two db queries in sequence: "QueryJobs" and "GetUpdate".
            # However, when we delete a job, updates are deleted first, so
            # there is a slight chance QueryJobs will fail to query the
            # update, returning a "not-found" error.
            if e.code() == grpc.StatusCode.NOT_FOUND:
                time.sleep(2)
                continue
            raise

    assert False, "timed out waiting for jobs to be deleted"
def patch_jobs(active_jobs=None, desired_jobs=None):
    """
    Checks the current state of each job and applies the desired goal
    state, which results in either creating a new job or updating an
    existing one.
    """
    jobs = {}
    for job_name, job_spec in desired_jobs.items():
        if job_name in active_jobs:
            # job exists -> update to desired state
            patch_job(active_jobs[job_name], job_spec)
            jobs[job_name] = active_jobs[job_name].get_job_id()
        else:
            # job does not exist -> create
            job = StatelessJob(
                job_config=job_spec,
                config=IntegrationTestConfig(
                    pool_file=RESPOOL_FILE_NAME,
                    max_retry_attempts=MAX_RETRY_ATTEMPTS,
                ),
            )
            job.create()
            time.sleep(10)
            job.wait_for_all_pods_running()
            jobs[job_name] = job.get_job_id()

    # TODO: Kill any undesired active job running in the canary cluster
    return jobs
def test_stop_running_job_with_active_update_remove_instances(in_place):
    stateless_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    assert len(stateless_job.query_pods()) == 5

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLING_FORWARD")

    stateless_job.stop()
    update.wait_for_state(goal_state="SUCCEEDED")
    assert stateless_job.get_spec().instance_count == 3
def test__delete_running_job_without_force_flag():
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    try:
        job.delete()
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.ABORTED
        return
    raise Exception("job in non-terminal state error not received")
def test__delete_sla_violated_job():
    """
    1. Create a stateless job (instance_count=5) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       2 of the instances will not get placed (hence unavailable).
    2. Force delete the job and verify that the job is deleted.
    """
    job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    job.job_spec.instance_count = 5
    job.create()
    job.wait_for_all_pods_running(num_pods=3)

    job.delete(force_delete=True)
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.NOT_FOUND
        return
    raise Exception("job not found error not received")
def test__health_check_detects_healthy_tasks():
    job = StatelessJob(
        job_file='test_stateless_job_successful_health_check_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    def task_has_healthy_events():
        # the healthy field carries a health-state enum name, matching the
        # "HEALTH_STATE_UNHEALTHY" check in the unhealthy-task test below
        for pod_event in job.get_pod(0).get_pod_events():
            if pod_event.healthy == 'HEALTH_STATE_HEALTHY':
                return True

    job.wait_for_condition(task_has_healthy_events)
def test__delete_initialized_job_with_force_flag():
    job = StatelessJob()
    job.create()
    # the job might have transitioned to INITIALIZED/PENDING,
    # since there is no way to precisely control the job transitions
    job.delete(force_delete=True)
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.NOT_FOUND
        return
    raise Exception("job not found error not received")
def get_unique_job(request):
    """
    Finds a unique job to run for a test. The job is selected at random
    across multiple test suite runs.
    """
    while True:
        job = None
        FILE_LOCK.acquire()
        try:
            pytest.test_job_map = read_from_file(TEST_JOB_MAP_FILE)
            pytest.job_in_use = read_from_file(JOB_IN_USE_FILE)
            job_list = list(pytest.jobs.keys())
            random.shuffle(job_list)
            for j in job_list:
                test_name = request.node.name
                id = test_name.split("[")[0] + "_" + j
                # check if test && job are already matched
                if id in pytest.test_job_map or j in pytest.job_in_use:
                    continue

                pytest.test_job_map[id] = ""
                pytest.job_in_use[j] = ""
                write_to_file(TEST_JOB_MAP_FILE, pytest.test_job_map)
                write_to_file(JOB_IN_USE_FILE, pytest.job_in_use)
                log.info(
                    "test_job_mapping:: test_name: %s, map_id: %s",
                    test_name,
                    id,
                )

                # create a fresh handle for the selected job
                job = StatelessJob(
                    job_id=pytest.jobs[j],
                    config=IntegrationTestConfig(
                        pool_file=RESPOOL_FILE_NAME,
                        max_retry_attempts=MAX_RETRY_ATTEMPTS,
                    ),
                )
                break
        finally:
            FILE_LOCK.release()

        if job is not None:
            break
        time.sleep(10)

    return job
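# read_from_file, write_to_file and FILE_LOCK are referenced above but not
# defined in this section. A minimal sketch under the assumption that the
# state files hold JSON-encoded dicts and that an in-process lock suffices
# (the real suite may use a cross-process file lock instead; these helper
# bodies are illustrative, not from the source):
import json
import os
import threading

FILE_LOCK = threading.Lock()

def read_from_file(path):
    # return an empty mapping when the state file does not exist yet
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)

def write_to_file(path, data):
    with open(path, "w") as f:
        json.dump(data, f)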
def cleanup_jobs(client, timeout_secs=20):
    """
    Calls peloton API to delete all currently running jobs.
    """
    jobs = [
        StatelessJob(job_id=s.job_id.value, client=client)
        for s in list_jobs()
    ]
    for job in jobs:
        job.delete(force_delete=True)
    wait_for_deletion(client, timeout_secs)
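# A usage sketch for cleanup_jobs (assumption: wired up as an autouse pytest
# fixture; the fixture name and scope here are illustrative, not from the
# source):
import pytest

@pytest.fixture(scope="module", autouse=True)
def clean_cluster(peloton_client):
    yield
    # force-delete whatever the tests left behind, then wait until the
    # deletions are reflected by the list-jobs API
    cleanup_jobs(peloton_client)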
def test__health_check_detects_unhealthy_tasks():
    job = StatelessJob(
        job_file="test_stateless_job_failed_health_check_spec.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    def task_has_unhealthy_events():
        for pod_event in job.get_pod(0).get_pod_events():
            if pod_event.healthy == "HEALTH_STATE_UNHEALTHY":
                return True

    job.wait_for_condition(task_has_unhealthy_events)
def stop_jobs(client):
    '''
    Calls peloton API to terminate all batch jobs and stateless jobs.
    '''
    # obtain a list of jobs from all resource pools and terminate them
    jobs = list_jobs()
    for job in jobs:
        job = StatelessJob(client=client, job_id=job.job_id.value)
        job.config.max_retry_attempts = 100
        job.stop()
        job.wait_for_terminated()
def test__failed_task_throttled_by_exponential_backoff():
    job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    time.sleep(40)

    pod_events = job.get_pod(0).get_pod_events()
    # if throttling is effective, the task should create only a few pod
    # events during the time window; without it, repeated restarts would
    # generate many more
    pod_id = pod_events[0].pod_id.value
    run_id = int(pod_id[pod_id.rindex('-') + 1:])
    assert 1 < run_id < 20
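# The run id is the numeric suffix of the pod id and is bumped on every
# restart, so it doubles as a restart counter. A worked example of the
# parsing above (the pod-id value is illustrative, not from the source):
def parse_run_id(pod_id):
    # everything after the last '-' is the run id
    return int(pod_id[pod_id.rindex('-') + 1:])

assert parse_run_id("a2b4c6d8-0-3") == 3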
def test__create_update_to_unset_health_check():
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    update = StatelessUpdate(job,
                             updated_job_file=UPDATE_STATELESS_JOB_SPEC,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='SUCCEEDED')
def patch_jobs(active_jobs=None, desired_jobs=None):
    """
    Checks the current state of each job and applies the desired goal
    state, which results in either creating a new job or updating an
    existing one.
    """
    jobs = {}
    for job_name, job_spec in desired_jobs.items():
        if job_name in active_jobs:
            j = active_jobs[job_name]
            # when FAILFAST is disabled ("NO"), patch the job to the desired
            # state; otherwise do not run the canary test until dirty jobs
            # are restored manually
            if os.getenv("FAILFAST") == "NO":
                # job exists -> update to desired state
                patch_job(j, job_spec)
                jobs[job_name] = j.get_job_id()
            else:
                # a non-nil job update diff means a previous canary test run
                # failed; block further runs until the issue is manually
                # debugged and state is restored
                job_spec.respool_id.value = j.get_spec().respool_id.value
                resp = j.get_replace_job_diff(job_spec=job_spec)
                print(resp)
                if len(resp.instances_removed) > 0 or \
                        len(resp.instances_updated) > 0 or \
                        len(resp.instances_added) > 0:
                    pytest.exit(
                        "canary test run was aborted, since jobs are dirty!!")
                jobs[job_name] = j.get_job_id()
        else:
            # job does not exist -> create
            job = StatelessJob(
                job_config=job_spec,
                config=IntegrationTestConfig(
                    pool_file=RESPOOL_FILE_NAME,
                    max_retry_attempts=MAX_RETRY_ATTEMPTS,
                ),
            )
            job.create()
            time.sleep(10)
            job.wait_for_all_pods_running()
            jobs[job_name] = job.get_job_id()

    # TODO: Kill any undesired active job running in the canary cluster
    return jobs
def test__create_update_to_disable_health_check():
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    job.job_spec.default_spec.containers[0].liveness_check.enabled = False
    update = StatelessUpdate(job,
                             updated_job_spec=job.job_spec,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='SUCCEEDED')
def test__in_place_update_success_rate():
    stateless_job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config("test_stateless_job_spec_k8s.yaml")
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30
    updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    # map each pod name (pod id minus the run-id suffix) to the host it
    # ran on, before and after the update
    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # an in-place update should keep every pod on its original host
    count = 0
    for pod_name in old_pod_dict:
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s",
                     pod_name,
                     old_pod_dict[pod_name],
                     new_pod_dict[pod_name])
            count = count + 1
    assert count == 0
def test__create_update_to_set_health_check(in_place):
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_SPEC,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            pool_file='test_stateless_respool.yaml',
        ),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
def test__create_update_to_change_health_check_config(in_place):
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.default_spec.containers[0].liveness_check.enabled = False
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.job_spec.default_spec.containers[
        0].liveness_check.initial_interval_secs = 2
    update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
def test__get_job_update_details__filter_non_update_workflow(client):
    """
    test getJobUpdateDetails endpoint for filtering non-update workflows
    """
    req1 = get_job_update_request("test_dc_labrat_large_job.yaml")
    req1.settings.updateGroupSize = 10
    req2 = get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml")
    req2.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(
        client, req1, "start job update test/dc/labrat_large_job")

    # trigger an unexpected restart through peloton api
    jobs = list_jobs()
    assert len(jobs) == 1
    job = StatelessJob(job_id=jobs[0].job_id.value)
    job.restart(batch_size=10)
    job.wait_for_workflow_state(goal_state="SUCCEEDED")  # wait for restart

    # start a new update
    start_job_update(
        client, req2, "start job update test/dc/labrat_large_job")

    # verify getJobUpdateDetails response: only the two updates should be
    # returned, and only the most recent one carries an initial state
    res = client.get_job_update_details(
        None, api.JobUpdateQuery(role=job_key.role))
    assert len(res.detailsList) == 2
    for i, detail in enumerate(res.detailsList):
        if i == 0:
            assert len(detail.update.instructions.initialState) > 0
            for initial in detail.update.instructions.initialState:
                assert initial.task.metadata, 'Expect metadata to be present'
        else:
            assert len(detail.update.instructions.initialState) == 0
def test__in_place_kill_job_release_host():
    job1 = StatelessJob(job_file="test_stateless_job_spec.yaml")
    job1.create()
    job1.wait_for_state(goal_state="RUNNING")

    job2 = StatelessJob(job_file="test_stateless_job_spec.yaml")
    job2.create()
    job2.wait_for_state(goal_state="RUNNING")

    update1 = StatelessUpdate(job1,
                              updated_job_file=UPDATE_STATELESS_JOB_SPEC,
                              batch_size=0)
    update1.create(in_place=True)
    # stop the update
    job1.stop()

    update2 = StatelessUpdate(job2,
                              updated_job_file=UPDATE_STATELESS_JOB_SPEC,
                              batch_size=0)
    update2.create()

    # both updates should complete
    update1.wait_for_state(goal_state="SUCCEEDED")
    update2.wait_for_state(goal_state="SUCCEEDED")
def test__delete_job_bad_version():
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    try:
        job.delete(entity_version="1-2-3")
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.ABORTED
        assert INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
        return
    raise Exception("entity version mismatch error not received")
def test__delete_killed_job():
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.delete()
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as e:
        assert e.code() == grpc.StatusCode.NOT_FOUND
        return
    raise Exception("job not found error not received")
def _list_jobs():
    return [StatelessJob(job_id=s.job_id.value) for s in list_jobs()]
def test__placement_exclusive_job(exclusive_host):
    excl_constraint = pod_pb2.Constraint(
        type=1,  # Label constraint
        label_constraint=pod_pb2.LabelConstraint(
            kind=2,  # Host
            condition=2,  # Equal
            requirement=1,
            label=peloton_pb2_v1alpha.Label(
                key="peloton/exclusive", value="exclusive-test-label"),
        ),
    )
    # We have 1 exclusive host and 2 non-exclusive hosts. Set the number of
    # instances to be a few more than what can run simultaneously on a
    # single exclusive host.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.default_spec.constraint.CopyFrom(excl_constraint)
    job.job_spec.instance_count = 6
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=4)
    job.stop()
    job.wait_for_terminated()

    # check that all of them ran on the exclusive host
    pod_summaries = job.list_pods()
    for s in pod_summaries:
        if s.status.host:
            assert "exclusive" in s.status.host
def test__placement_non_exclusive_job(exclusive_host):
    # We have 1 exclusive host and 2 non-exclusive hosts. Set the number of
    # instances to be a few more than what can run simultaneously on the
    # 2 non-exclusive hosts.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.instance_count = 10
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=5)
    job.stop()
    job.wait_for_terminated()

    # check that none of them ran on the exclusive host
    pod_summaries = job.list_pods()
    for s in pod_summaries:
        if s.status.host:
            assert "exclusive" not in s.status.host
def test__failed_task_automatically_restart():
    job = StatelessJob(
        job_file="test_stateless_job_exit_with_err_spec.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    old_pod_id = job.get_pod(0).get_pod_status().pod_id.value

    def job_not_running():
        return job.get_status().state != "JOB_STATE_RUNNING"

    job.wait_for_condition(job_not_running)

    def pod_id_changed():
        new_pod_id = job.get_pod(0).get_pod_status().pod_id.value
        return old_pod_id != new_pod_id

    job.wait_for_condition(pod_id_changed)
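# StatelessJob.wait_for_condition is used throughout but not defined in this
# section. A minimal sketch of the polling loop it presumably implements
# (the timeout and poll interval are illustrative assumptions, not from the
# source):
import time

def wait_for_condition(condition, timeout_secs=300, interval_secs=2):
    # poll the predicate until it returns a truthy value or time runs out
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        if condition():
            return
        time.sleep(interval_secs)
    raise Exception(
        "timed out waiting for condition %s" % condition.__name__)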
def test_update_killed_job(in_place):
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC)
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")

    assert job.get_spec().instance_count == 3
    assert job.get_status().state == stateless_pb2.JOB_STATE_KILLED