def run_task_workload(total_num_cpus, smoke):
    """Run task-based workload that doesn't require object reconstruction."""

    @ray.remote(num_cpus=1, max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        # Deliberate CPU busy-work: grow a string one random letter at a time.
        buf = ""
        for _ in range(100000):
            buf = buf + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(num_cpus=1, max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # Smoke mode runs a much smaller number of tasks.
    multiplier = 1 if smoke else 75
    TOTAL_TASKS = int(total_num_cpus * 2 * multiplier)

    pb = ProgressBar("Chaos test", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    pb.block_until_complete(results)
    pb.close()

    # Consistency check: every CPU should be free again once all tasks finish.
    wait_for_condition(
        lambda: (
            ray.cluster_resources().get("CPU", 0)
            == ray.available_resources().get("CPU", 0)
        ),
        timeout=60,
    )
def test_chaos_task_retry(set_kill_interval):
    # Chaos testing: tasks must survive periodic worker kills via retries.
    @ray.remote(max_retries=-1)
    def task():
        # CPU busy-work built up one random character at a time.
        acc = ""
        for _ in range(100000):
            acc = acc + random.choice(string.ascii_letters)
        return

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    TOTAL_TASKS = 100

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()