def test_at_task_exit(n=2):
    """Check that task_exit checkpointing records one event per task.

    Launches ``n`` ``slow_double`` apps, waits for each of them to finish,
    then counts the pickled records stored in ``checkpoint/tasks.pkl``
    under the DFK's run directory.

    Args:
        n: Number of apps to launch, and therefore the number of
            checkpoint records expected in the file.
    """
    print("Launching: ", n)
    futures = [slow_double(index) for index in range(0, n)]
    print("Done launching")

    # Block until every app has produced its result.
    for future in futures:
        future.result()

    # NOTE(review): another definition of this test in the same source reads
    # ``dfk.run_dir`` instead of ``dfk.rundir`` -- confirm which attribute
    # the DFK actually exposes.
    checkpoint_path = "{}/checkpoint/tasks.pkl".format(dfk.rundir)
    with time_limited_open(checkpoint_path, 'rb', seconds=5) as f:
        tasks = []
        try:
            # Records are pickled back to back; keep loading until
            # pickle.load signals the end of the file with EOFError.
            while f:
                tasks.append(pickle.load(f))
        except EOFError:
            pass

        found = len(tasks)
        assert found == n, "Expected {} checkpoint events, got {}".format(
            n, found)
def test_checkpointing_at_dfk_exit():
    """Ensure failed tasks are not cached with dfk_exit mode.

    Tests #239

    Runs the checkpointed workload in ``dfk_exit`` mode via
    ``run_checkpointed`` and then reads every pickled record from the
    resulting ``checkpoint/tasks.pkl``. Exactly one record is expected.
    """
    rundir = run_checkpointed(mode="dfk_exit")

    checkpoint_path = "{}/checkpoint/tasks.pkl".format(rundir)
    with time_limited_open(checkpoint_path, 'rb', seconds=2) as f:
        tasks = []
        try:
            # Records are pickled back to back; pickle.load raises EOFError
            # once the file is exhausted, which is the only way out of
            # this loop.
            while True:
                tasks.append(pickle.load(f))
        except EOFError:
            pass

        print("Tasks from cache : ", tasks)
        assert len(tasks) == 1, "Expected {} checkpoint events, got {}".format(
            1, len(tasks))
def test_at_task_exit(n=2):
    """Check that task_exit checkpointing records one event per task.

    Launches ``n`` ``slow_double`` apps, waits for each of them to finish,
    then counts the pickled records stored in ``checkpoint/tasks.pkl``
    under the DFK's run directory.

    Args:
        n: Number of apps to launch, and therefore the number of
            checkpoint records expected in the file.
    """
    print("Launching: ", n)
    futures = [slow_double(index) for index in range(0, n)]
    print("Done launching")

    # Block until every app has produced its result.
    for future in futures:
        future.result()

    # Two potential race conditions are worth knowing about when debugging
    # this test:
    #   i) .result() returning only means that the AppFuture has had its
    #      result set. In the DFK implementation at the time of writing it
    #      does not appear to guarantee that the corresponding checkpoint
    #      has already been written.
    #  ii) time_limited_open gives up after a fixed time limit. The limit
    #      may look generous, but waiting on it is still a race.
    checkpoint_path = "{}/checkpoint/tasks.pkl".format(dfk.run_dir)
    with time_limited_open(checkpoint_path, 'rb', seconds=5) as f:
        tasks = []
        try:
            # Records are pickled back to back; keep loading until
            # pickle.load signals the end of the file with EOFError.
            while f:
                tasks.append(pickle.load(f))
        except EOFError:
            pass

        found = len(tasks)
        assert found == n, "Expected {} checkpoint events, got {}".format(
            n, found)