Exemplo n.º 1
0
    def GetValue(self):
        """Return the requested columns as NumPy arrays, converting on first use.

        On the first call, the C++ result of every requested column is
        fetched through its result pointer (which triggers, if necessary, the
        event loop running the Take actions) and converted to a NumPy array.
        The resulting dict is cached in ``self._py_arrays``; later calls
        return the cached dict without converting again.

        Returns:
            dict: key is the column name, value is the NumPy array for that
                column.
        """
        if self._py_arrays is None:
            # Imported lazily so numpy is only required once values are read.
            import numpy
            from ROOT.pythonization._rdf_utils import ndarray

            # Build into a local dict and publish it only when every column
            # has been converted, so a failure half-way through does not
            # leave a partially filled cache behind for the next call.
            py_arrays = {}
            for column in self._columns:
                result_ptr = self._result_ptrs[column]
                cpp_reference = result_ptr.GetValue()
                if hasattr(cpp_reference, "__array_interface__"):
                    # This adopts the memory of the C++ object.
                    values = numpy.asarray(cpp_reference)
                else:
                    # The builtin `object` is used as dtype: the alias
                    # `numpy.object` was deprecated in NumPy 1.20 and
                    # removed in NumPy 1.24.
                    values = numpy.empty(len(cpp_reference), dtype=object)
                    for i, x in enumerate(cpp_reference):
                        # This creates only the wrapping of the objects and
                        # does not copy.
                        values[i] = x
                # The result pointer is handed to the ndarray wrapper together
                # with the data (presumably to keep the C++ storage alive;
                # confirm against _rdf_utils.ndarray).
                py_arrays[column] = ndarray(values, result_ptr)
            self._py_arrays = py_arrays

        return self._py_arrays
Exemplo n.º 2
0
def RDataFrameAsNumpy(df, columns=None, exclude=None):
    """Read-out the RDataFrame as a collection of numpy arrays.

    The values of the dataframe are read out as numpy array of the respective type
    if the type is a fundamental type such as float or int. If the type of the column
    is a complex type, such as your custom class or a std::array, the returned numpy
    array contains Python objects of this type interpreted via PyROOT.

    Be aware that reading out custom types is much less performant than reading out
    fundamental types, such as int or float, which are supported directly by numpy.

    The reading is performed in multiple threads if the implicit multi-threading of
    ROOT is enabled.

    Note that this is an instant action of the RDataFrame graph and will trigger the
    event-loop.

    Parameters:
        df: The dataframe to read out. It must provide GetColumnNames,
            GetColumnType and Take.
        columns: Iterable of column names to read. If None (or empty), all
            columns reported by the dataframe are read.
        exclude: Column names to drop from the selection. None excludes nothing.

    Returns:
        dict: Dict with column names as keys and 1D numpy arrays with content as values

    Raises:
        ImportError: If numpy or the ndarray helper class cannot be imported.
    """
    # Import numpy and numpy.array derived class lazily
    try:
        import numpy
        from ROOT.pythonization._rdf_utils import ndarray
    except ImportError:
        # Only import failures are translated; a bare `except` would also
        # swallow KeyboardInterrupt/SystemExit and unrelated errors.
        raise ImportError(
            "Failed to import numpy during call of RDataFrame.AsNumpy.")

    # Find all column names in the dataframe if no column are specified.
    # The truthiness test is deliberate: an empty selection also means "all".
    if not columns:
        columns = [str(c) for c in df.GetColumnNames()]

    # Exclude the specified columns
    if exclude is None:
        exclude = []
    columns = [col for col in columns if col not in exclude]

    # Register Take action for each column. All actions are booked before the
    # first GetValue call below, so they are in place when the event loop runs.
    result_ptrs = {}
    for column in columns:
        column_type = df.GetColumnType(column)
        result_ptrs[column] = df.Take[column_type](column)

    # Convert the C++ vectors to numpy arrays
    py_arrays = {}
    for column in columns:
        result_ptr = result_ptrs[column]
        cpp_reference = result_ptr.GetValue()
        if hasattr(cpp_reference, "__array_interface__"):
            # This adopts the memory of the C++ object.
            values = numpy.asarray(cpp_reference)
        else:
            # The builtin `object` is used as dtype: the alias `numpy.object`
            # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
            values = numpy.empty(len(cpp_reference), dtype=object)
            for i, x in enumerate(cpp_reference):
                # This creates only the wrapping of the objects and does not copy.
                values[i] = x
        # The result pointer is handed to the ndarray wrapper together with the
        # data (presumably to keep the C++ storage alive; confirm against
        # _rdf_utils.ndarray).
        py_arrays[column] = ndarray(values, result_ptr)

    return py_arrays