Example #1
    def make_dataframe(self, *args, **kwargs):
        """
        Creates an instance of a distributed RDataFrame that can send
        computations to a Dask cluster.
        """
        # Set the number of partitions for this dataframe from the
        # user-supplied `npartitions` optional argument, defaulting to
        # None when the user does not provide one.
        npartitions = kwargs.pop("npartitions", None)
        headnode = HeadNode.get_headnode(self, npartitions, *args)
        return DataFrame.RDataFrame(headnode)
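This version leaves `npartitions` as None when the caller omits it, presumably deferring the choice to the head node. A minimal usage sketch follows, assuming a Dask-based backend class exposing this method; `DaskBackend` and its constructor argument are hypothetical names, while `make_dataframe`, `npartitions`, and the forwarded positional arguments come from the snippet above.

    # Hypothetical usage sketch; `DaskBackend` is an assumed class name.
    from dask.distributed import Client, LocalCluster

    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    backend = DaskBackend(daskclient=client)  # hypothetical constructor
    # Positional arguments travel to HeadNode.get_headnode; `npartitions`
    # is consumed here and defaults to None when omitted.
    rdf = backend.make_dataframe("treename", ["f1.root", "f2.root"],
                                 npartitions=4)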
Example #2
    def make_dataframe(self, *args, **kwargs):
        """
        Creates an instance of a distributed RDataFrame that can send
        computations to a Dask cluster.
        """
        # Set the number of partitions for this dataframe, one of the following:
        # 1. User-supplied `npartitions` optional argument
        # 2. An educated guess according to the backend, using the backend's
        #    `optimize_npartitions` function
        # 3. A fallback of 2 partitions
        npartitions = kwargs.pop("npartitions", self.optimize_npartitions())
        headnode = HeadNode.get_headnode(self, npartitions, *args)
        return DataFrame.RDataFrame(headnode)
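Unlike the first example, the fallback here is the backend's `optimize_npartitions` guess. Below is a self-contained toy sketch of that fallback order (a toy class, not the real DistRDF backend); note that `dict.pop` evaluates its default eagerly, so `optimize_npartitions()` runs even when the user passes `npartitions` explicitly.

    # Toy sketch of the fallback chain implemented by `kwargs.pop`.
    class ToyBackend:
        def optimize_npartitions(self):
            # Stand-in for the backend's educated guess; the real method
            # may fall back to 2 partitions, as the comment above states.
            return 2

        def make_dataframe(self, *args, **kwargs):
            # A user-supplied value wins; otherwise the backend's guess.
            return kwargs.pop("npartitions", self.optimize_npartitions())

    backend = ToyBackend()
    print(backend.make_dataframe(npartitions=8))  # 8: user-supplied
    print(backend.make_dataframe())               # 2: backend fallback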
Example #3
    def test_count_result_invariance(self):
        """
        Tests that counting the entries in the dataset does not depend on
        the number of partitions. This invariance could break if a
        TEntryList were used to restrict processing to a certain range of
        entries of the TChain in a distributed task before the changes in
        https://github.com/root-project/root/commit/77bd5aa82e9544811e0d5fce197ab87c739c2e23
        were implemented.
        """
        treename = "entries"
        filenames = ["1cluster_20entries.root"] * 5

        for npartitions in range(1, 6):
            headnode = HeadNode.get_headnode(npartitions, treename, filenames)
            backend = DistRDataFrameInvariants.TestBackend()
            rdf = DataFrame.RDataFrame(headnode, backend)
            self.assertEqual(rdf.Count().GetValue(), 100)
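The invariant being tested can be stated independently of ROOT: however the 100 entries (5 files of 20 entries each) are split into contiguous per-task ranges, the per-range counts must sum back to 100. A minimal sketch, with a hypothetical `split_ranges` helper standing in for the actual splitting logic:

    # Self-contained sketch of the invariant; `split_ranges` is a
    # hypothetical helper, not the splitting code used by DistRDF.
    def split_ranges(nentries, npartitions):
        """Split [0, nentries) into npartitions contiguous ranges."""
        step, rest = divmod(nentries, npartitions)
        starts = [i * step + min(i, rest) for i in range(npartitions + 1)]
        return list(zip(starts[:-1], starts[1:]))

    for npartitions in range(1, 6):
        ranges = split_ranges(100, npartitions)
        assert sum(end - start for start, end in ranges) == 100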
Example #4
    def make_dataframe(self, *args, **kwargs):
        """Creates an instance of SparkDataFrame."""
        headnode = Node.HeadNode(*args)
        return DataFrame.RDataFrame(headnode, self, **kwargs)
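One design difference from the Dask examples above: here `**kwargs` are forwarded to `DataFrame.RDataFrame` rather than popped inside `make_dataframe`. A hedged usage sketch follows, where `SparkBackend` and its constructor argument are assumed names, not taken from the snippet.

    # Hypothetical usage sketch; `SparkBackend` is an assumed class name.
    from pyspark import SparkConf, SparkContext

    conf = SparkConf().setAppName("distrdf-example").setMaster("local[2]")
    backend = SparkBackend(sparkcontext=SparkContext(conf=conf))  # hypothetical
    # Keyword arguments pass through to DataFrame.RDataFrame here.
    rdf = backend.make_dataframe("treename", ["file.root"])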