def _(mergeable: SnapshotResult, node, backend):
    """
    Attach the final result of the distributed computation to its DistRDF
    node.

    Overload for `SnapshotResult` mergeables: the value is obtained through
    the `GetValue` method of the mergeable. The 'backend' argument is
    forwarded to that call because a distributed RDataFrame has to be
    recreated on the same backend as the input one.

    Args:
        mergeable (SnapshotResult): The merged result of a Snapshot
            operation.
        node: The DistRDF node that receives the value in its `value`
            attribute.
        backend: The backend handed over to `SnapshotResult.GetValue`.
    """
    final_value = mergeable.GetValue(backend)
    node.value = final_value
def _(operation: Snapshot, promise: Any, results: list) -> None:
    """
    Record the partial result of a Snapshot operation.

    A `SnapshotResult` is built from the first two positional arguments of
    the operation and appended to `results`. The second argument is wrapped
    in a single-element list.

    Args:
        operation (Snapshot): The Snapshot operation; its `args[0]` and
            `args[1]` are read (presumably the tree name and the output
            file path - confirm against the Snapshot operation class).
        promise (Any): Not used by this overload.
        results (list): The list of results, modified in place.
    """
    # NOTE(review): names below follow the usual Snapshot signature; verify.
    treename = operation.args[0]
    filename = operation.args[1]
    snapshot_result = SnapshotResult(treename, [filename])
    results.append(snapshot_result)
def generate_computation_graph(self, previous_node, range_id, distrdf_node=None):
    """
    Build the RDF computation graph by walking the DistRDF nodes
    depth-first and calling the matching RDF operation at every step.

    Args:
        previous_node (Any): The node of the RDF computation graph on which
            the operation of the current recursive step is called. In the
            first step this is the RDataFrame object to be processed: if
            the head node of the graph is an EmptySourceHeadNode it is the
            result of a call to the Range operation, if the head node is a
            TreeHeadNode it is an actual RDataFrame. Later steps receive
            the result of an RDF operation call (e.g. Histo1D, Count).

        range_id (int): The id of the current range. Needed to assign a
            file name to a partial Snapshot if it was requested.

        distrdf_node (DistRDF.Node.Node | None): The DistRDF node handled
            in this step. None in the first step, in which case the
            DistRDF head node is used.

    Returns:
        list: The actions of the computation graph that have to be
        triggered. Each element is some kind of promise of a result
        (usually an RResultPtr). Exceptions are the 'AsNumpy' operation,
        for which an 'AsNumpyResult' is returned, and the 'Snapshot'
        operation, for which a 'SnapshotResult' is returned.
    """
    booked_actions = []

    if distrdf_node is not None:
        op = distrdf_node.operation

        # Resolve the RDF method on the upstream node first, then give the
        # operation the chance to be adjusted for this range before the
        # actual call happens.
        rdf_method = getattr(previous_node, op.name)
        self._make_op_lazy_if_needed(op, range_id)
        produced = rdf_method(*op.args, **op.kwargs)

        # Storing the pyroot object on the DistRDF node ties their
        # lifetimes together: both stay alive for as long as something
        # references the DistRDF node.
        distrdf_node.pyroot_node = produced

        # Children of this node have to be called on what was just created
        previous_node = produced

        if op.is_action() or op.is_instant_action():
            if op.name == "Snapshot":
                booked = SnapshotResult(op.args[0], [op.args[1]])
            else:
                booked = produced
            booked_actions.append(booked)
    else:
        # First recursive step: start the traversal from the head node
        distrdf_node = self.headnode

    # Descend into every child and collect the actions found below it
    for child in distrdf_node.children:
        booked_actions += self.generate_computation_graph(
            previous_node, range_id, child)

    return booked_actions
def _run_function(self, wf_id: int) -> Tuple[List, List[str]]:
    """
    Runs the workflow generation function.

    Args:
        wf_id (int): identifier of the workflow function to be executed.

    Returns:
        tuple: the first element is the list of results of the actions in
        the C++ workflow, the second element is the list of result types
        corresponding to those actions. For every Snapshot operation the
        result is replaced by a `SnapshotResult` and its result type by a
        `None` placeholder.

    Raises:
        RuntimeError: if a non-empty result type string reported by the
            workflow does not contain a '<' character.
    """
    ns = getattr(ROOT, CppWorkflow._FUNCTION_NAMESPACE)
    func = getattr(ns, CppWorkflow._FUNCTION_NAME + str(wf_id))

    # Run the workflow generator function
    vectors = func(self.starting_node)  # need to keep the tuple alive
    v_results, v_res_types, v_nodes = vectors

    # Convert the vector of results into a list so that we can mix
    # different types in it.
    # We copy the results since the life of the original ones is tied to
    # that of the vector
    results = [ROOT.RDF.RResultHandle(res) for res in v_results]

    # Strip out the ROOT::RDF::RResultPtr<> part of the type
    def get_result_type(s):
        if s.empty():
            # Python-only actions have an empty return type in C++
            return ''

        s = str(s)
        pos = s.find('<')
        if pos == -1:
            raise RuntimeError(
                'Error parsing the result types of RDataFrame workflow')
        # Keep what is between the first '<' and the last character
        return s[pos + 1:-1].strip()

    res_types = [get_result_type(elem) for elem in v_res_types]

    # Add Python-only actions on their corresponding nodes
    for (res_ptr_id, operation), n in zip(self._py_actions, v_nodes):
        operation.kwargs['lazy'] = True  # make it lazy
        results[res_ptr_id] = getattr(n, operation.name)(
            *operation.args, **operation.kwargs)

    if v_results:
        # We trigger the event loop here, so make sure we release the GIL
        RunGraphs = ROOT.RDF.RunGraphs
        old_rg = RunGraphs.__release_gil__
        RunGraphs.__release_gil__ = True
        try:
            RunGraphs(v_results)
        finally:
            # The attribute lives on the shared RunGraphs callable: put the
            # previous setting back even when the event loop raises, so a
            # failure here does not leak the changed setting to other
            # callers.
            RunGraphs.__release_gil__ = old_rg

    # Replace the RResultHandle of each Snapshot by its modified output
    # path, since the latter is what we actually need in the reducer
    for res_ptr_id, treename, path in self._snapshots:
        results[res_ptr_id] = SnapshotResult(treename, [path])
        res_types[res_ptr_id] = None  # placeholder

    # AsNumpyResult needs to be triggered before being merged
    for res_ptr_id, _ in self._py_actions:
        results[res_ptr_id].GetValue()

    return results, res_types
def _(mergeable: SnapshotResult, node, backend):
    """
    Store the value of a `SnapshotResult` on the given node.

    The value is retrieved with `mergeable.GetValue(backend)` and assigned
    to the `value` attribute of `node`.

    Args:
        mergeable (SnapshotResult): The object providing the value.
        node: The object whose `value` attribute is set.
        backend: Argument forwarded to `GetValue`.
    """
    snapshot_value = mergeable.GetValue(backend)
    node.value = snapshot_value