Пример #1
0
def _(mergeable: SnapshotResult, node, backend):
    """
    Attach the final, post-merge value of a distributed Snapshot to its
    DistRDF node.

    The value is obtained through `SnapshotResult.GetValue`. That method is
    given the 'backend' because a distributed RDataFrame has to be recreated
    with the same backend as the input one.

    Args:
        mergeable (SnapshotResult): The merged Snapshot result.
        node: The DistRDF node whose `value` attribute is set.
        backend: The backend forwarded to `GetValue`.
    """
    final_value = mergeable.GetValue(backend)
    node.value = final_value
Пример #2
0
def _(operation: Snapshot, promise: Any, results: list) -> None:
    """
    Record a Snapshot operation in the list of results.

    A `SnapshotResult` is built from the first two positional arguments of
    the operation and appended to `results`; the second argument is wrapped
    in a one-element list.

    Args:
        operation (Snapshot): The Snapshot operation being processed.
        promise (Any): Not used by this overload.
        results (list): The list that is extended in place.
    """
    # NOTE(review): the first two arguments are presumably the tree name and
    # the output path of the Snapshot -- confirm against the callers.
    first_arg = operation.args[0]
    second_arg = operation.args[1]
    snapshot_result = SnapshotResult(first_arg, [second_arg])
    results.append(snapshot_result)
    def generate_computation_graph(self,
                                   previous_node,
                                   range_id,
                                   distrdf_node=None):
        """
        Build the RDF computation graph by walking the DistRDF nodes
        depth-first and replaying each node's operation on the object returned
        by its parent's operation.

        Args:
            previous_node (Any): The node of the RDF computation graph on
                which the operation of the current recursive step is called.
                In the first step this is the RDataFrame object that will be
                processed: the result of a call to the Range operation when
                the head node is an EmptySourceHeadNode, an actual RDataFrame
                when the head node is a TreeHeadNode. Later steps receive the
                result of an RDF operation call (e.g. Histo1D, Count).
            range_id (int): The id of the current range. Needed to assign a
                file name to a partial Snapshot if it was requested.
            distrdf_node (DistRDF.Node.Node | None): The DistRDF node handled
                by this step. It is None in the first step, in which case the
                DistRDF head node is used.

        Returns:
            list: The actions of the computation graph that have to be
            triggered. Each element is some kind of promise of a result
            (usually an RResultPtr), except for 'Snapshot' operations, for
            which a 'SnapshotResult' is stored, and for 'AsNumpy' operations,
            for which an 'AsNumpyResult' is returned.
        """
        pending_actions = []

        if distrdf_node is None:
            # First step of the recursion: start from the head node. No
            # operation is replayed for it, `previous_node` is used as is.
            current = self.headnode
        else:
            current = distrdf_node
            op = current.operation

            # Look up the RDF method on the parent's result, let the
            # operation be made lazy if required, then call it.
            rdf_method = getattr(previous_node, op.name)
            self._make_op_lazy_if_needed(op, range_id)
            previous_node = rdf_method(*op.args, **op.kwargs)

            # Storing the pyroot object on the DistRDF node ties their
            # lifetimes together: both stay alive as long as something
            # references the DistRDF node. From here on `previous_node` is
            # the object the children operate on.
            current.pyroot_node = previous_node

            if op.is_action() or op.is_instant_action():
                pending_actions.append(
                    SnapshotResult(op.args[0], [op.args[1]])
                    if op.name == "Snapshot" else previous_node)

        # Descend into every child and collect the actions found below it
        for child in current.children:
            child_actions = self.generate_computation_graph(
                previous_node, range_id, child)
            pending_actions.extend(child_actions)

        return pending_actions
Пример #4
0
    def _run_function(self, wf_id: int) -> Tuple[List, List[str]]:
        '''
        Runs the workflow generation function and triggers the event loop.

        Args:
            wf_id (int): identifier of the workflow function to be executed.

        Returns:
            tuple: the first element is the list of results of the actions in
                the C++ workflow, the second element is the list of result types
                corresponding to those actions. The result type is an empty
                string for Python-only actions and None for Snapshot
                operations, whose result is replaced by a 'SnapshotResult'.

        Raises:
            RuntimeError: if the type of a result does not contain a '<',
                i.e. it cannot be parsed as a template instantiation.
        '''

        ns = getattr(ROOT, CppWorkflow._FUNCTION_NAMESPACE)
        func = getattr(ns, CppWorkflow._FUNCTION_NAME + str(wf_id))

        # Run the workflow generator function
        vectors = func(self.starting_node)  # need to keep the tuple alive
        v_results, v_res_types, v_nodes = vectors

        # Convert the vector of results into a list so that we can mix
        # different types in it.
        # We copy the results since the life of the original ones is tied to
        # that of the vector
        results = [ROOT.RDF.RResultHandle(res) for res in v_results]

        # Strip out the ROOT::RDF::RResultPtr<> part of the type
        def get_result_type(s):
            if s.empty():
                # Python-only actions have an empty return type in C++
                return ''

            s = str(s)
            pos = s.find('<')
            if pos == -1:
                raise RuntimeError(
                    'Error parsing the result types of RDataFrame workflow')
            # Keep what is between the first '<' and the last character
            return s[pos + 1:-1].strip()

        res_types = [get_result_type(elem) for elem in v_res_types]

        # Add Python-only actions on their corresponding nodes
        for (res_ptr_id, operation), n in zip(self._py_actions, v_nodes):
            operation.kwargs['lazy'] = True  # make it lazy
            py_action = getattr(n, operation.name)
            results[res_ptr_id] = py_action(*operation.args,
                                            **operation.kwargs)

        if v_results:
            # We trigger the event loop here, so make sure we release the GIL
            RunGraphs = ROOT.RDF.RunGraphs
            old_rg = RunGraphs.__release_gil__
            RunGraphs.__release_gil__ = True
            try:
                RunGraphs(v_results)
            finally:
                # Restore the previous setting even if the event loop raised,
                # so that a failure here does not leave the flag modified
                RunGraphs.__release_gil__ = old_rg

        # Replace the RResultHandle of each Snapshot by its modified output
        # path, since the latter is what we actually need in the reducer
        for res_ptr_id, treename, path in self._snapshots:
            results[res_ptr_id] = SnapshotResult(treename, [path])
            res_types[res_ptr_id] = None  # placeholder

        # AsNumpyResult needs to be triggered before being merged
        for res_ptr_id, _operation in self._py_actions:
            results[res_ptr_id].GetValue()

        return results, res_types
Пример #5
0
def _(mergeable: SnapshotResult, node, backend):
    """
    Set the value of a DistRDF node from a merged Snapshot result.

    Args:
        mergeable (SnapshotResult): The object whose `GetValue` method
            provides the value; it is called with `backend`.
        node: The node whose `value` attribute is assigned.
        backend: The backend passed on to `GetValue`.
    """
    snapshot_value = mergeable.GetValue(backend)
    node.value = snapshot_value