def test_full_branching(insertion_order):
    """Test full binary tree, in random order"""
    tracker = RootTracker()
    for node in insertion_order:
        tracker.add(node, binary_parent(node))

    # prune all the way to the leaf of (3, 0)
    for num_prunings in range(3):
        root_id, depth = tracker.get_root((3, 0))
        assert root_id[0] == num_prunings
        assert depth == 3 - num_prunings
        tracker.prune(root_id)

    assert tracker.get_root((3, 7)) == ((1, 1), 2)
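

# For reference, a minimal sketch of the parent helper these tests assume.
# The real `binary_parent` and `FULL_BINARY_TREE` are defined elsewhere in
# this module; the hypothetical `_binary_parent_sketch` below only illustrates
# the assumed (depth, index_within_depth) node encoding, with (0, 0) as the
# root of the full binary tree.
def _binary_parent_sketch(node):
    depth, index = node
    # node i at depth d descends from node i // 2 at depth d - 1,
    # e.g. (3, 7) -> (2, 3) -> (1, 1) -> (0, 0), matching the asserts above
    return (depth - 1, index // 2)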


def test_out_of_order_line(insertion_order):
    tracker = RootTracker()
    for node in insertion_order:
        tracker.add(node, node - 1)

    for expected_root in range(9):
        for node in range(expected_root, 10):
            root, depth = tracker.get_root(node)
            assert depth == node - expected_root
            assert root == expected_root

        # always prune from the end
        prune_root_id, _ = tracker.get_root(9)
        tracker.prune(prune_root_id)

        # root should not be retrievable anymore
        with pytest.raises(ValidationError):
            tracker.get_root(prune_root_id)


def test_sparse_branching(test_data):
    nodes_to_insert, prune_order = test_data

    def get_expected_root(node, present_nodes):
        expected_depth = 0
        expected_root = node
        parent_node = binary_parent(node)
        while parent_node in present_nodes:
            expected_depth += 1
            expected_root = parent_node
            parent_node = binary_parent(parent_node)
        return expected_root, expected_depth

    tracker = RootTracker()
    for node in nodes_to_insert:
        tracker.add(node, binary_parent(node))

    # verify parent and depth of partially-built tree
    for node in nodes_to_insert:
        actual_root, actual_depth = tracker.get_root(node)
        expected_root, expected_depth = get_expected_root(node, nodes_to_insert)
        assert actual_root == expected_root
        assert actual_depth == expected_depth

    # prune
    remaining_nodes = set(nodes_to_insert)
    for prune_idx in [idx for idx in prune_order if idx < len(nodes_to_insert)]:
        node_to_prune_from = nodes_to_insert[prune_idx]
        if node_to_prune_from not in remaining_nodes:
            continue

        prune_root_id, _ = tracker.get_root(node_to_prune_from)
        tracker.prune(prune_root_id)
        remaining_nodes.remove(prune_root_id)

        for node in remaining_nodes:
            actual_root, actual_depth = tracker.get_root(node)
            expected_root, expected_depth = get_expected_root(node, remaining_nodes)
            assert actual_root == expected_root
            assert actual_depth == expected_depth


def test_prune_reinsert_root_tracking_linear(element_flipping):
    tracker = RootTracker()
    present = set()
    for node in element_flipping:
        if node in present:
            prune_root_id, _ = tracker.get_root(node)
            tracker.prune(prune_root_id)
            present.remove(prune_root_id)
        else:
            tracker.add(node, node - 1)
            present.add(node)

    # validate all the present nodes have valid roots
    for test_node in present:
        root_id, depth = tracker.get_root(test_node)

        # make sure parent is *not* present
        assert root_id - 1 not in present

        # make sure depth is correct
        assert depth == test_node - root_id


def test_prune_reinsert_root_tracking_binary_tree(element_flipping):
    tracker = RootTracker()
    present = set()
    for node_id in element_flipping:
        node = FULL_BINARY_TREE[node_id]
        if node in present:
            prune_root_id, _ = tracker.get_root(node)
            tracker.prune(prune_root_id)
            present.remove(prune_root_id)
        else:
            tracker.add(node, binary_parent(node))
            present.add(node)

    # validate all the present nodes have valid roots
    for test_node in present:
        root_node, depth = tracker.get_root(test_node)

        # make sure parent is *not* present
        assert binary_parent(root_node) not in present

        # make sure depth is correct
        assert depth == test_node[0] - root_node[0]
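

# A minimal smoke-test sketch of the RootTracker API exercised above, assuming
# the linear-chain convention of test_out_of_order_line (each integer node's
# parent is node - 1); the specific node values are arbitrary:
def test_root_tracker_linear_smoke():
    tracker = RootTracker()
    for node in (5, 6, 7):
        tracker.add(node, node - 1)

    # node 5's parent (4) was never added, so 5 is the root of the chain
    assert tracker.get_root(7) == (5, 2)

    # pruning removes the *root*; its child then becomes the new root
    tracker.prune(5)
    assert tracker.get_root(7) == (6, 1)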


class OrderedTaskPreparation(
        BaseOrderedTaskPreparation[TTask, TTaskID],
        Generic[TTask, TTaskID, TPrerequisite]):
    """
    This class is useful when a series of tasks with prerequisites must be run
    sequentially. The prerequisites may be finished in any order, but a task may
    only be run when all of its prerequisites are complete and the task it
    depends on is also ready. Tasks may only depend on one other task.

    For example, you might want to download block bodies and receipts at random,
    but need to import them sequentially. Importing blocks is the ``task``,
    downloading the parts is the ``prerequisite``, and a block's parent is its
    ``dependency``.

    Below is a sketch of how to do that:

        # The complete list of prerequisites to complete
        class BlockDownloads(Enum):
            receipts = auto()
            bodies = auto()

        block_import_tasks = OrderedTaskPreparation(
            BlockDownloads,

            # we use this method to extract an ID from the header:
            lambda header: header.hash,

            # we use this method to extract the ID of the dependency,
            # so that we can guarantee that the parent block gets imported first
            lambda header: header.parent_hash,
        )

        # We mark the genesis block as already imported, so header1 is ready
        # as soon as its prerequisites are complete.
        block_import_tasks.set_finished_dependency(header0)

        # We register the tasks before completing any prerequisites
        block_import_tasks.register_tasks((header1, header2, header3))

        # Start download of bodies & receipts...
        # They complete in random order

        # we notify this class which prerequisites are complete:
        block_import_tasks.finish_prereq(BlockDownloads.receipts, (header2, header3))
        block_import_tasks.finish_prereq(BlockDownloads.bodies, (header1, header2))

        # this await would hang, waiting on the receipt from header1:
        # await block_import_tasks.ready_tasks()

        block_import_tasks.finish_prereq(BlockDownloads.receipts, (header1, ))

        # now we have all the necessary info to import blocks 1 and 2
        headers_ready_to_import = await block_import_tasks.ready_tasks()

        # these will always return in sequential order:
        assert headers_ready_to_import == (header1, header2)

    In a real implementation, you would have a loop waiting on :meth:`ready_tasks`
    and importing the headers it returns, rather than interleaving the calls as in
    the example above.

    Note that this class does *not* track when the main tasks are complete. It is
    assumed that the caller will complete the tasks in the order they are returned
    by ready_tasks().

    The memory needs of this class would naively be unbounded. Any newly registered
    task might depend on any other task in history. To prevent unbounded memory
    usage, old tasks are pruned after a configurable depth. Pruning is triggered
    when `ready_tasks()` is called, starting from the tail of the *previous*
    ready_tasks() result.

    Vocab:

    - prerequisites: all of these must be completed for a task to be ready
      (a necessary but not sufficient condition)
    - ready: a task is ready after all its prereqs are completed, and the task it
      depends on is also ready. The initial ready task is set with
      :meth:`set_finished_dependency`
    """
    # methods to extract the id and dependency IDs out of a task
    _id_of: StaticMethod[Callable[[TTask], TTaskID]]
    _dependency_of: StaticMethod[Callable[[TTask], TTaskID]]

    # by default, how long should the integrator wait before pruning?
    _default_max_depth = 10  # not sure how to pick a good default here

    _prereq_tracker: Type[BaseTaskPrerequisites[TTask, TPrerequisite]]

    # track roots
    _roots: RootTracker[TTaskID]

    NoPrerequisites = NoPrerequisites
    """
    This is a helper to identify that no prerequisites are required at all, only ordering of tasks

    It can be used like so: `OrderedTaskPreparation(OrderedTaskPreparation.NoPrerequisites, ...)`
    """

    def __init__(
            self,
            prerequisites: Type[TPrerequisite],
            id_extractor: Callable[[TTask], TTaskID],
            dependency_extractor: Callable[[TTask], TTaskID],
            accept_dangling_tasks: bool = False,
            max_depth: int = None) -> None:

        self._prereq_tracker = BaseTaskPrerequisites.from_enum(prerequisites)
        self._id_of = id_extractor
        self._dependency_of = dependency_extractor
        self._accept_dangling_tasks = accept_dangling_tasks

        # how long to wait before pruning
        if max_depth is None:
            self._max_depth = self._default_max_depth
        elif max_depth < 0:
            raise ValidationError(f"The maximum depth must be at least 0, not {max_depth}")
        else:
            self._max_depth = max_depth

        # all of the tasks that have been completed, and not pruned
        self._tasks: Dict[TTaskID, BaseTaskPrerequisites[TTask, TPrerequisite]] = {}

        # In self._dependencies, when the key becomes ready, the task ids in the
        # value set *might* also become ready
        # (they only become ready if their prerequisites are complete)
        self._dependencies: Dict[TTaskID, Set[TTaskID]] = defaultdict(set)

        # task ids are in this set if either:
        # - one of their prerequisites is incomplete, OR
        # - the task they depend on is not ready
        self._unready: Set[TTaskID] = set()

        # This is a queue of tasks that have become ready, in order.
        # They wait in this Queue until being returned by ready_tasks().
        self._ready_tasks: 'Queue[TTask]' = Queue()

        # Declared finished with set_finished_dependency()
        self._declared_finished: Set[TTaskID] = set()

        self._roots = RootTracker()

        self._last_yielded_tasks: Tuple[TTask, ...] = tuple()

    def set_finished_dependency(self, finished_task: TTask) -> None:
        """
        Mark this task as already finished. This is a bootstrapping method.
        In general, tasks are marked as finished by :meth:`finish_prereq`. But how
        do we know which task is first, and that its dependency is complete? We
        call `set_finished_dependency`.

        Since a task can only become ready when the task it depends on is ready,
        the first result from ready_tasks will depend on a ``finished_task`` set
        in this method. (More precisely, it will depend on *one of* the
        ``finished_task`` objects set with this method, since the method may be
        called multiple times.)
        """
        completed = self._prereq_tracker(finished_task)
        completed.set_complete()
        task_id = self._id_of(finished_task)
        if task_id in self._tasks:
            raise DuplicateTasks(
                f"Can't set a new finished dependency {finished_task} id:{task_id}, "
                "it's already registered",
                (finished_task, ),
            )
        self._tasks[task_id] = completed
        self._declared_finished.add(task_id)

        dependency_id = self._dependency_of(finished_task)
        self._roots.add(task_id, dependency_id)
        if dependency_id in self._tasks:
            # the parent of this finished dependency is already tracked,
            # so record that this task depends on it
            self._dependencies[dependency_id].add(task_id)

        # note that this task is intentionally *not* added to self._unready

    def register_tasks(self, tasks: Tuple[TTask, ...], ignore_duplicates: bool = False) -> None:
        """
        Initiate a task into tracking. By default, each task must be registered
        *after* its dependency has been registered.

        If you want to be able to register non-contiguous tasks, you can
        initialize this instance with: ``accept_dangling_tasks=True``.

        :param tasks: the tasks to register, in iteration order
        :param ignore_duplicates: any tasks that have already been registered will be
            ignored, whether ready or not
        """
        identified_tasks = tuple((self._id_of(task), task) for task in tasks)
        duplicates = tuple(task for task_id, task in identified_tasks if task_id in self._tasks)

        if duplicates and not ignore_duplicates:
            raise DuplicateTasks(
                f"Cannot re-register tasks: {duplicates!r} for completion",
                duplicates,
            )

        task_meta_info = tuple(
            (self._prereq_tracker(task), task_id, self._dependency_of(task))
            for task_id, task in identified_tasks
            # when ignoring duplicates, must not try to re-add them
            if task_id not in self._tasks
        )

        for prereq_tracker, task_id, dependency_id in task_meta_info:
            if not self._accept_dangling_tasks and dependency_id not in self._tasks:
                raise MissingDependency(
                    f"Cannot prepare task {prereq_tracker!r} with id {task_id} and "
                    f"dependency {dependency_id} before preparing its dependency "
                    f"among tasks {task_meta_info!r}, from the original registration: "
                    f"{tasks!r}."
                )
            else:
                self._tasks[task_id] = prereq_tracker
                self._unready.add(task_id)
                self._dependencies[dependency_id].add(task_id)
                self._roots.add(task_id, dependency_id)

                if prereq_tracker.is_complete and self._is_ready(prereq_tracker.task):
                    # this is possible for tasks with 0 prerequisites (useful for pure ordering)
                    self._mark_complete(task_id)

    def finish_prereq(self, prereq: TPrerequisite, tasks: Tuple[TTask, ...]) -> None:
        """For every task in tasks, mark the given prerequisite as completed"""
        if len(self._tasks) == 0:
            raise ValidationError(
                "Cannot finish a task until set_finished_dependency() is called")

        for task in tasks:
            task_id = self._id_of(task)
            if task_id not in self._tasks:
                raise ValidationError(f"Cannot finish task {task_id!r} before preparing it")
            elif task_id not in self._unready:
                raise ValidationError(
                    f"Cannot finish prereq {prereq} of task {task} id:{task_id!r} that is complete"
                )

            task_completion = self._tasks[task_id]
            task_completion.finish(prereq)
            if task_completion.is_complete and self._is_ready(task):
                self._mark_complete(task_id)

    async def ready_tasks(self, max_results: int = None) -> Tuple[TTask, ...]:
        """
        Return the next batch of tasks that are ready to process. If none are
        ready, hang until at least one task becomes ready.
""" for completed_task in self._last_yielded_tasks: task_id = self._id_of(completed_task) # Attempt pruning at least twice (to eventually catch up after forks) # re-running is okay, because pruning limits the prune depth self._prune_finished(task_id) self._prune_finished(task_id) self._last_yielded_tasks = await queue_get_batch( self._ready_tasks, max_results) return self._last_yielded_tasks def has_ready_tasks(self) -> bool: return not self._ready_tasks.empty() def _is_ready(self, task: TTask) -> bool: dependency = self._dependency_of(task) if dependency in self._declared_finished: # Ready by declaration return True elif dependency in self._tasks and dependency not in self._unready: # Ready by insertion and tracked completion return True else: return False def _mark_complete(self, task_id: TTaskID) -> None: qualified_tasks = tuple([task_id]) while qualified_tasks: qualified_tasks = tuple( mapcat( self._mark_one_task_complete, qualified_tasks, )) @to_tuple def _mark_one_task_complete(self, task_id: TTaskID) -> Iterable[TTaskID]: """ Called when this task is completed and its dependency is complete, for the first time :return: any task IDs that can now also be marked as complete """ task_completion = self._tasks[task_id] # put this task in the completed queue self._ready_tasks.put_nowait(task_completion.task) # note that this task has been made ready self._unready.remove(task_id) # resolve tasks that depend on this task for depending_task_id in self._dependencies[task_id]: # we already know that this task is ready, so we only need to check completion if self._tasks[depending_task_id].is_complete: yield depending_task_id def _prune_finished(self, task_id: TTaskID) -> None: """ This prunes the oldest data, if it starts more than _max_depth in history. It is called when the task has been consumed by the caller via `ready_tasks()` and completed. The workflow looks something like: :return: True if a node was pruned :: otp = OrderedTaskPreparation(...) otp.register_tasks(range(3)) otp.finish_prereq(OnlyPrereq, (0, 1)) assert await otp.ready_tasks() == (0, 1) # Do some processing on ready tasks (0, 1) ... # Complete processing on ready tasks (0, 1) await otp.ready_tasks() # ^ when this is called, pruning is triggered from # the tip of task 1 (whether or not task 2 is ready) """ root_task_id, depth = self._roots.get_root(task_id) num_to_prune = depth - self._max_depth if num_to_prune <= 0: return else: self._prune(root_task_id) def _prune(self, prune_task_id: TTaskID) -> None: # _roots.prune() has validation in it, so if there is a problem, we should skip the rest self._roots.prune(prune_task_id) del self._dependencies[prune_task_id] del self._tasks[prune_task_id] if prune_task_id in self._declared_finished: self._declared_finished.remove(prune_task_id)