def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    """Kill a task that is READY in the resource manager, then re-enqueue it.

    The placement engines are stopped first so that tasks stay in READY.
    Instance 0 of a two-instance long running job is stopped and started
    again while the engines are down; once the engines are back, instance 0
    is expected to reach RUNNING.

    Args:
        placement_engines: fixture object exposing ``stop()`` and ``start()``
            for the placement engines.
    """
    # Stop the placement engines to keep the tasks in READY state.
    placement_engines.stop()
    try:
        # Decorate the client to add peloton private API stubs.
        c = with_private_stubs(Client())

        # Create long running job with 2 instances.
        long_running_job = Job(
            job_file='long_running_job.yaml',
            options=[
                with_instance_count(2),
            ],
            client=c,
        )
        long_running_job.create()
        long_running_job.wait_for_state(goal_state='PENDING')

        ready_task = long_running_job.get_task(0)
        # Wait for the task to reach READY.
        ready_task.wait_for_pending_state(goal_state='READY')

        # Kill the task ...
        ready_task.stop()
        # ... and re-enqueue it.
        ready_task.start()
    finally:
        # Gentlemen, start your (placement) engines. Done in ``finally`` so
        # that a failure above does not leave the engines stopped for the
        # tests that run after this one.
        placement_engines.start()

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == 'RUNNING'

    long_running_job.wait_for_condition(wait_for_instance_to_run)
def _count_pending_and_check_pool(job, host_to_pool, expected_pool):
    """Count PENDING tasks of ``job`` and check where the others are placed.

    Every task that is not PENDING must be on a host that ``host_to_pool``
    maps to ``expected_pool``; otherwise an ``AssertionError`` is raised
    (or ``KeyError`` if the host is not in ``host_to_pool`` at all).

    Args:
        job: job object providing ``get_tasks()`` and ``get_task(id)``.
        host_to_pool: dict mapping hostname -> host-pool name.
        expected_pool: pool name the non-pending tasks must be placed in.

    Returns:
        Number of tasks whose ``state_str`` is ``"PENDING"``.
    """
    pending = 0
    for instance_id in job.get_tasks():
        instance = job.get_task(instance_id)
        if instance.state_str == "PENDING":
            pending += 1
            continue
        hostname = instance.get_runtime().host
        assert host_to_pool[hostname] == expected_pool
    return pending


def test__dynamic_partition_pool_restrictions(peloton_client):
    """Check that each job type is only placed on hosts of its own pool.

    Three host-pools with one host each are set up (batch_reserved, shared,
    stateless). A non-preemptible job, a stateless job and a preemptible
    batch job are then created, and for each the test asserts both the
    number of instances left PENDING and the pool of the hosts the other
    instances were placed on.

    Args:
        peloton_client: client fixture passed to every ``Job``.
    """
    # We start with shared=1, batch_reserved=2.
    # Delete batch_reserved so that its hosts go to "default".
    delete_host_pool(util.HOSTPOOL_BATCH_RESERVED)

    # Set up 3 host-pools with 1 host each.
    ensure_host_pool(util.HOSTPOOL_BATCH_RESERVED, 1)
    ensure_host_pool(util.HOSTPOOL_SHARED, 1)
    ensure_host_pool(util.HOSTPOOL_STATELESS, 1)

    host_to_pool = {
        host: pool.name
        for pool in list_host_pools().pools
        for host in pool.hosts
    }

    # Jobs are recorded as soon as they are created so that the ``finally``
    # clause stops exactly those, in creation order, even when an assertion
    # or a wait fails half way through.
    created_jobs = []
    try:
        # Job has two instances with 3 cpus each.
        # Only one instance will run.
        npjob = Job(
            client=peloton_client,
            job_file="test_non_preemptible_job.yaml",
            config=IntegrationTestConfig(max_retry_attempts=100),
        )
        npjob.create()
        created_jobs.append(npjob)
        npjob.wait_for_state(goal_state='RUNNING')

        pending = _count_pending_and_check_pool(
            npjob, host_to_pool, util.HOSTPOOL_BATCH_RESERVED)
        assert pending == 1

        # Stateless job has 4 instances with host limit 1,
        # so only one instance will run.
        sjob = Job(
            client=peloton_client,
            job_file="test_stateless_job_host_limit_1.yaml",
            config=IntegrationTestConfig(
                max_retry_attempts=100, sleep_time_sec=2),
        )
        sjob.create()
        created_jobs.append(sjob)
        sjob.wait_for_state(goal_state="RUNNING")

        pending = _count_pending_and_check_pool(
            sjob, host_to_pool, util.HOSTPOOL_STATELESS)
        assert pending == 3

        # Preemptible batch job has 12 instances with 1 CPU each,
        # so 4 instances will run.
        pjob = Job(
            client=peloton_client,
            job_file="test_preemptible_job.yaml",
            config=IntegrationTestConfig(
                max_retry_attempts=100, sleep_time_sec=2),
        )
        pjob.create()
        created_jobs.append(pjob)
        pjob.wait_for_state(goal_state="RUNNING")

        pending = _count_pending_and_check_pool(
            pjob, host_to_pool, util.HOSTPOOL_SHARED)
        assert pending == 8
    finally:
        # Stop all jobs that were created.
        for job in created_jobs:
            job.stop()
def test__tasks_reserve_execution(hostreservepool, peloton_client):
    """Check that a large task gets a host reserved and then runs on it.

    A "median" job is started first and the test waits until all of its
    tasks are RUNNING. A single-instance "large" job is then created; its
    task is expected to go through RESERVED, a host is expected to show up
    with status ``'reserved'``, and the task must end up RUNNING on that
    host. After the large job reaches its default goal state no host may
    still be reported as reserved.

    Args:
        hostreservepool: resource pool fixture both jobs are created in.
        peloton_client: client fixture; it is also decorated with the
            private API stubs used for the host manager queries.
    """
    # Jobs are recorded once created so ``finally`` can kill exactly those,
    # even when a wait or an assertion fails.
    created_jobs = []
    try:
        p_job_median = Job(
            client=peloton_client,
            job_file='test_hostreservation_job_median.yaml',
            pool=hostreservepool,
            config=IntegrationTestConfig(
                max_retry_attempts=100,
                sleep_time_sec=1),
        )
        p_job_median.create()
        created_jobs.append(p_job_median)
        p_job_median.wait_for_state(goal_state='RUNNING')

        # We should have all 3 tasks in running state.
        def all_running():
            return all(
                t.state == task.RUNNING
                for t in p_job_median.get_tasks().values()
            )

        p_job_median.wait_for_condition(all_running)

        # Decorate the client to add peloton private API stubs.
        client = with_private_stubs(peloton_client)

        p_job_large = Job(
            job_file='test_hostreservation_job_large.yaml',
            pool=hostreservepool,
            config=IntegrationTestConfig(
                sleep_time_sec=1,
                max_retry_attempts=300),
            options=[with_instance_count(1)],
            client=client,
        )
        p_job_large.create()
        created_jobs.append(p_job_large)
        p_job_large.wait_for_state(goal_state='PENDING')

        request = hostmgr.GetHostsByQueryRequest()

        def query_hosts():
            # Single place for the host manager query used below.
            return client.hostmgr_svc.GetHostsByQuery(
                request,
                metadata=p_job_large.client.hostmgr_metadata,
                timeout=p_job_large.config.rpc_timeout_sec,
            ).hosts

        # Task should get into RESERVED state and then RUNNING state.
        t1 = p_job_large.get_task(0)
        t1.wait_for_pending_state(goal_state="RESERVED")

        def get_reserved_host():
            for h in query_hosts():
                if h.status == 'reserved':
                    return h.hostname
            return ''

        # Remember the hostname seen by the polling predicate itself.
        # Querying again after the wait returns could miss the host if the
        # reservation has been consumed in the meantime, which would make
        # the placement assertion below fail spuriously.
        seen_reserved = []

        def is_reserved():
            hostname = get_reserved_host()
            if hostname != '':
                seen_reserved.append(hostname)
                return True
            return False

        p_job_large.wait_for_condition(is_reserved)
        # Fall back to '' (as the re-query used to yield) if the wait
        # returned without the predicate ever observing a reserved host.
        reserved_host = seen_reserved[-1] if seen_reserved else ''

        # The task is running on the reserved host.
        t1.wait_for_pending_state(goal_state="RUNNING")
        assert reserved_host == t1.get_info().runtime.host

        # p_job_large should succeed.
        p_job_large.wait_for_state()

        # No host is in reserved state any more.
        for host in query_hosts():
            assert host.status != 'reserved'
    finally:
        kill_jobs(created_jobs)