Example #1
    def test_definepersample_withinitialization(self, connection):
        """
        Test DefinePerSample operation on three samples using C++ functions
        declared to the ROOT interpreter.
        """

        # Write initialization code that will be run in the workers to make the
        # needed functions available
        def declare_definepersample_code():
            ROOT.gInterpreter.Declare('''
            #ifndef distrdf_test_definepersample_withinitialization
            #define distrdf_test_definepersample_withinitialization
            float sample1_weight(){
                return 1.0f;
            }

            float sample2_weight(){
                return 2.0f;
            }

            float sample3_weight(){
                return 3.0f;
            }

            float samples_weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
                if (id.Contains("sample1")){
                    return sample1_weight();
                } else if (id.Contains("sample2")){
                    return sample2_weight();
                } else if (id.Contains("sample3")){
                    return sample3_weight();
                }
                return -999.0f;
            }

            std::string samples_names(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
                return id.AsString();
            }
            #endif // distrdf_test_definepersample_withinitialization
            ''')

        DistRDF.initialize(declare_definepersample_code)
        df = Spark.RDataFrame(self.maintreename,
                              self.filenames,
                              sparkcontext=connection)
        df1 = df.DefinePerSample("sample_weight", "samples_weights(rdfslot_, rdfsampleinfo_)")\
                .DefinePerSample("sample_name", "samples_names(rdfslot_, rdfsampleinfo_)")

        # Filter by the two defined columns per sample: a weight and the sample string representation
        # Each filtered dataset should have 10 entries, equal to the number of entries per sample
        weightsandnames = [("1.0f", "sample1.root/Events"),
                           ("2.0f", "sample2.root/Events"),
                           ("3.0f", "sample3.root/Events")]
        samplescounts = [
            df1.Filter("sample_weight == {} && sample_name == \"{}\"".format(
                weight, name)).Count() for (weight, name) in weightsandnames
        ]

        for count in samplescounts:
            assert count.GetValue() == 10
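The class attributes used above (`self.maintreename`, `self.filenames`) and in Example #10 (`self.samples`) are set up outside the snippets shown here. A minimal sketch of such a fixture, assuming three files with an "Events" tree of 10 entries each (names and entry counts are inferred from the assertions in Examples #1 and #10):

    samples = ["sample1", "sample2", "sample3"]
    filenames = ["sample1.root", "sample2.root", "sample3.root"]
    maintreename = "Events"

    @classmethod
    def setup_class(cls):
        # Hypothetical fixture: write 10 entries per file so that every
        # per-sample count in the tests equals 10
        for filename in cls.filenames:
            ROOT.RDataFrame(10).Define("x", "rdfentry_")\
                .Snapshot(cls.maintreename, filename, ["x"])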
Example #2
    def _extend_ROOT_include_path(self, connection):
        """
        Check that the include path of ROOT is extended with the directories
        specified in `DistRDF.include_headers()` so references between headers
        are correctly resolved.
        """

        # Create an RDataFrame with 100 integers from 0 to 99
        rdf = Spark.RDataFrame(100, sparkcontext=connection)

        # Distribute headers to the workers
        header_folder = "../test_headers/headers_folder"
        rdf._headnode.backend.distribute_headers(header_folder)

        # Get list of include paths seen by ROOT
        ROOT_include_path = ROOT.gInterpreter.GetIncludePath().split(" ")

        # Create new include folder token
        new_folder_include = "-I\"{}\"".format(header_folder)

        # Check that new folder is in ROOT include paths
        assert new_folder_include in ROOT_include_path

        # Keep only the numbers less than 10 and create a histogram
        rdf_less_than_10 = rdf.Filter("check_number_less_than_10(tdfentry_)")
        histo1 = rdf_less_than_10.Histo1D("tdfentry_")

        # Check that histogram has 10 entries and mean 4.5
        assert histo1.GetEntries() == 10
        assert histo1.GetMean() == pytest.approx(4.5)
Example #3
    def test_tchain_with_friend_tchain_histo(self, connection):
        """
        Tests that the computational graph can be executed on both the
        parent chain and the friend chain.
        """

        main_filename = "main_chain.root"
        friend_filename = "friend_chain.root"

        self.create_parent_tree(main_filename)
        self.create_friend_tree(friend_filename)

        # Main TChain
        mainchain = ROOT.TChain("T")
        mainchain.Add(main_filename)

        # Friend TChain
        friendchain = ROOT.TChain("TF")
        friendchain.Add(friend_filename)

        # Add friend chain to the main one
        mainchain.AddFriend(friendchain)

        # Create a DistRDF RDataFrame with the main and the friend chains
        df = Spark.RDataFrame(mainchain, sparkcontext=connection)

        # Create histograms
        h_parent = df.Histo1D("x")
        h_friend = df.Histo1D("TF.x")

        check_histograms(h_parent, h_friend)

        # Remove unnecessary .root files
        os.remove(main_filename)
        os.remove(friend_filename)
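The helpers `create_parent_tree`, `create_friend_tree` and `check_histograms` are defined elsewhere in the test suite. A minimal sketch of what they could look like, assuming a branch "x" in the parent tree "T" and in the friend tree "TF"; the entry counts and distributions are assumptions:

    def create_parent_tree(self, filename):
        # Hypothetical: fill branch "x" of tree "T" with Gaussian values
        ROOT.RDataFrame(10000).Define("x", "gRandom->Gaus(10, 1)")\
            .Snapshot("T", filename, ["x"])

    def create_friend_tree(self, filename):
        # Hypothetical: fill branch "x" of tree "TF" with Gaussian values
        ROOT.RDataFrame(10000).Define("x", "gRandom->Gaus(20, 1)")\
            .Snapshot("TF", filename, ["x"])

def check_histograms(h_parent, h_friend):
    # Hypothetical checks: accessing the proxies triggers the distributed
    # event loop; both histograms must be filled with the same number of entries
    assert h_parent.GetEntries() == h_friend.GetEntries()
    assert h_parent.GetEntries() > 0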
Example #4
    def test_initialization_method(self, connection):
        """
        Check `DistRDF.initialize` with the Spark backend. The initialization
        function declares an integer value in the ROOT interpreter; check that
        this value is available in the worker processes.
        """
        def init(value):
            import ROOT
            cpp_code = f"int userValue = {value};"
            ROOT.gInterpreter.ProcessLine(cpp_code)

        DistRDF.initialize(init, 123)
        # Spark backend has a limited list of supported methods, so we use
        # Histo1D which is a supported action.
        # The code below creates an RDataFrame instance with one single entry
        # and defines a column 'u' whose value is taken from the variable
        # 'userValue'.
        # This variable is only declared inside the ROOT interpreter, however
        # the value of the variable is passed by the user from the python side.
        # If the init function defined by the user is properly propagated to the
        # Spark backend, each worker will run the init function as a first step
        # and hence the variable 'userValue' will be defined at runtime.
        # As a result the define operation should read the variable 'userValue'
        # and assign it to the entries of the column 'u' (only one entry).
        # Finally, Histo1D returns a histogram filled with one value. The mean
        # of this single value has to be the value itself, independently of
        # the number of spawned workers.
        df = Spark.RDataFrame(1, sparkcontext=connection).Define(
            "u", "userValue").Histo1D("u")
        h = df.GetValue()
        assert h.GetMean() == 123
Example #5
    def test_distributed_snapshot_columnlist(self, connection):
        """
        Test that distributed Snapshot also correctly forwards its third input
        argument, "columnList".
        """
        # A simple dataframe with ten sequential numbers from 0 to 9
        df = Spark.RDataFrame(10, sparkcontext=connection)\
            .Define("a", "rdfentry_")\
            .Define("b", "rdfentry_")\
            .Define("c", "rdfentry_")\
            .Define("d", "rdfentry_")

        expectedcolumns = ["a", "b"]
        df.Snapshot("snapTree_columnlist",
                    "distrdf_spark_snapfile_columnlist.root", expectedcolumns)

        # Create a traditional RDF from the snapshotted files to retrieve the
        # list of columns
        tmp_files = [
            "distrdf_spark_snapfile_columnlist_0.root",
            "distrdf_spark_snapfile_columnlist_1.root"
        ]
        rdf = ROOT.RDataFrame("snapTree_columnlist", tmp_files)
        snapcolumns = [str(column) for column in rdf.GetColumnNames()]

        assert snapcolumns == expectedcolumns

        for filename in tmp_files:
            os.remove(filename)
Example #6
    def build_pyrdf_graph(self):
        """
        Create a DistRDF graph with a fixed set of operations and return it.
        """
        treename = "data"
        files = [
            "http://root.cern/files/teaching/CMS_Open_Dataset.root",
        ]
        rdf = Spark.RDataFrame(treename, files, npartitions=5)

        # Define the analysis cuts
        chargeCutStr = "C1 != C2"
        etaCutStr = "fabs(eta1) < 2.3 && fabs(eta2) < 2.3"
        ptCutStr = "pt1 > 2 && pt2 > 2"
        rdf_f = rdf.Filter(chargeCutStr, "Opposite Charge") \
                   .Filter(etaCutStr, "Central Muons") \
                   .Filter(ptCutStr, "Sane Pt")

        # Create the invariant mass column
        invMassFormulaStr = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                             "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
        rdf_fd = rdf_f.Define("invMass", invMassFormulaStr)

        # Create the histograms
        pt1_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt1")
        pt2_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt2")
        model = ("invMass", "CMS Opendata;#mu#mu mass[GeV];Events", 512, 5,
                 110)
        invMass_h = rdf_fd.Histo1D(model, "invMass")
        pi = ROOT.TMath.Pi()
        model = ("", "", 64, -pi, pi, 64, -pi, pi)
        phis_h = rdf_fd.Histo2D(model, "phi1", "phi2")

        return pt1_h, pt2_h, invMass_h, phis_h
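The method above only books the operations; nothing runs until a result proxy is accessed. A minimal driver sketch (hypothetical, not part of the original code):

    def run_pyrdf_graph(self):
        # Hypothetical driver: accessing any method of a proxy (e.g. GetEntries)
        # triggers the distributed event loop and materializes all booked results
        pt1_h, pt2_h, invMass_h, phis_h = self.build_pyrdf_graph()
        assert pt1_h.GetEntries() > 0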
Example #7
    def test_distributed_sum(self, connection):
        """Test support for `Sum` operation in distributed backend"""
        rdf_py = Spark.RDataFrame(10, sparkcontext=connection)
        rdf_def = rdf_py.Define("x", "rdfentry_")
        rdf_sum = rdf_def.Sum("x")

        assert rdf_sum.GetValue() == 45.0
Example #8
    def _includes_function_with_filter_and_histo(self, connection):
        """
        Check that the filter operation is able to use C++ functions that
        were included using header files.
        """

        rdf = Spark.RDataFrame(10, sparkcontext=connection)

        rdf._headnode.backend.distribute_headers("../test_headers/header1.hxx")

        # This keeps only the numbers less than 5
        rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
        histo = rdf_filtered.Histo1D("tdfentry_")

        # The expected set of numbers that survive the filter
        required_numbers = range(5)
        required_size = len(required_numbers)
        required_mean = sum(required_numbers) / float(required_size)
        required_stdDev = math.sqrt(
            sum((x - required_mean)**2 for x in required_numbers) /
            required_size)

        # Compare the sizes of equivalent set of numbers
        assert histo.GetEntries() == required_size
        # Compare the means of equivalent set of numbers
        assert histo.GetMean() == required_mean
        # Compare the standard deviations of equivalent set of numbers
        assert histo.GetStdDev() == required_stdDev
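The header `header1.hxx` itself is not shown; the test only assumes it declares `check_number_less_than_5`. An equivalent declaration through the ROOT interpreter, sketched here as an assumption about the header's contents:

    def declare_check_number_less_than_5(self):
        # Hypothetical equivalent of ../test_headers/header1.hxx
        ROOT.gInterpreter.Declare('''
        bool check_number_less_than_5(ULong64_t entry){
            return entry < 5;
        }
        ''')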
Example #9
    def test_distributed_sum(self):
        """Test support for `Sum` operation in distributed backend"""
        rdf_py = Spark.RDataFrame(10)
        rdf_def = rdf_py.Define("x", "rdfentry_")
        rdf_sum = rdf_def.Sum("x")

        self.assertAlmostEqual(rdf_sum.GetValue(), 45.0)
Example #10
    def test_definepersample_simple(self, connection):
        """
        Test DefinePerSample operation on three samples using a predefined
        string of operations.
        """

        df = Spark.RDataFrame(self.maintreename,
                              self.filenames,
                              sparkcontext=connection)

        # Associate a number to each sample
        definepersample_code = """
        if(rdfsampleinfo_.Contains(\"{}\")) return 1;
        else if (rdfsampleinfo_.Contains(\"{}\")) return 2;
        else if (rdfsampleinfo_.Contains(\"{}\")) return 3;
        else return 0;
        """.format(*self.samples)

        df1 = df.DefinePerSample("sampleid", definepersample_code)

        # Filter by the sample number. Each filtered dataframe should contain
        # 10 entries, equal to the number of entries per sample
        samplescounts = [
            df1.Filter("sampleid == {}".format(id)).Count()
            for id in [1, 2, 3]
        ]

        for count in samplescounts:
            assert count.GetValue() == 10
Example #11
    def test_distributed_asnumpy(self):
        """Test support for `AsNumpy` pythonization in distributed backend"""

        # Let's create a simple dataframe with ten rows and two columns
        df = Spark.RDataFrame(10).Define("x", "(int)rdfentry_")\
            .Define("y", "1.f/(1.f+rdfentry_)")

        # Build a dictionary of numpy arrays.
        npy = df.AsNumpy()
        self.assertIsInstance(npy, dict)

        # Retrieve the two numpy arrays with the column names of the original
        # RDataFrame as dictionary keys.
        npy_x = npy["x"]
        npy_y = npy["y"]
        self.assertIsInstance(npy_x, numpy.ndarray)
        self.assertIsInstance(npy_y, numpy.ndarray)

        # Check the two arrays are of the same length as the original columns.
        self.assertEqual(len(npy_x), 10)
        self.assertEqual(len(npy_y), 10)

        # Check the types correspond to the ones of the original columns.
        int_32_dtype = numpy.dtype("int32")
        float_32_dtype = numpy.dtype("float32")
        self.assertEqual(npy_x.dtype, int_32_dtype)
        self.assertEqual(npy_y.dtype, float_32_dtype)
Example #12
    def test_write_histo(self):
        """
        Tests that a histogram is correctly written to a .root file created
        before the execution of the event loop.
        """
        self.create_tree_with_data()

        # Create a new file where the histogram will be written
        outfile = ROOT.TFile("out_file.root", "recreate")

        # Create a DistRDF RDataFrame reading the tree written above
        df = Spark.RDataFrame("Events", "tree_gaus.root")

        # Create histogram
        histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

        # Write histogram to out_file.root and close the file
        histo.Write()
        outfile.Close()

        # Reopen file to check that histogram was correctly stored
        reopen_file = ROOT.TFile("out_file.root", "read")
        reopen_histo = reopen_file.Get("x")

        # Check histogram statistics
        self.assertEqual(reopen_histo.GetEntries(), self.nentries)
        self.assertAlmostEqual(reopen_histo.GetMean(), self.gaus_mean,
                               delta=self.delta_equal)
        self.assertAlmostEqual(reopen_histo.GetStdDev(), self.gaus_stdev,
                               delta=self.delta_equal)

        # Remove unnecessary .root files
        os.remove("tree_gaus.root")
        os.remove("out_file.root")
Example #13
    def test_distributed_snapshot(self):
        """Test support for `Snapshot` in distributed backend"""
        # A simple dataframe with ten sequential numbers from 0 to 9
        df = Spark.RDataFrame(10).Define("x", "rdfentry_")

        # Count rows in the dataframe
        nrows = df.Count()

        # Snapshot to two files, build a ROOT.TChain with them and retrieve a
        # Spark.RDataFrame
        snapdf = df.Snapshot("snapTree", "snapFile.root")

        # Count the rows in the snapshotted dataframe
        snapcount = snapdf.Count()

        self.assertEqual(nrows.GetValue(), 10)
        self.assertEqual(snapcount.GetValue(), 10)

        # Retrieve the list of files from the snapshotted dataframe
        input_files = snapdf.proxied_node.get_inputfiles()
        # Create list of supposed filenames for the intermediary files
        tmp_files = ["snapFile_0_4.root", "snapFile_5_9.root"]
        # Check that the two lists are the same
        self.assertListEqual(input_files, tmp_files)
        # Check that the intermediary .root files were created with the right
        # names, then remove them because they are not necessary
        for filename in tmp_files:
            self.assertTrue(os.path.exists(filename))
            os.remove(filename)
Example #14
    def test_distributed_asnumpy(self, connection):
        """Test support for `AsNumpy` pythonization in distributed backend"""

        # Let's create a simple dataframe with ten rows and two columns
        df = Spark.RDataFrame(10, sparkcontext=connection).Define("x", "(int)rdfentry_")\
            .Define("y", "1.f/(1.f+rdfentry_)")

        # Build a dictionary of numpy arrays.
        npy = df.AsNumpy()
        self.check_npy_dict(npy)
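`check_npy_dict` mirrors the assertions of Example #11. A minimal sketch, assuming the same two columns "x" (int) and "y" (float) with ten entries each:

    def check_npy_dict(self, npy):
        # The result must be a dictionary of numpy arrays keyed by column name
        assert isinstance(npy, dict)
        assert all(isinstance(arr, numpy.ndarray) for arr in npy.values())
        # Same length and dtypes as the original columns
        assert len(npy["x"]) == 10 and len(npy["y"]) == 10
        assert npy["x"].dtype == numpy.dtype("int32")
        assert npy["y"].dtype == numpy.dtype("float32")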
Example #15
    def test_distributed_snapshot(self, connection):
        """Test support for `Snapshot` in distributed backend"""
        # A simple dataframe with ten sequential numbers from 0 to 9
        df = Spark.RDataFrame(10, sparkcontext=connection).Define(
            "x", "rdfentry_")

        # Snapshot to two files, build a ROOT.TChain with them and retrieve a
        # Spark.RDataFrame
        snapdf = df.Snapshot("snapTree", "snapFile.root")
        self.check_snapshot_df(snapdf, "snapFile")
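`check_snapshot_df` plays the role of the explicit checks in Example #13. A minimal sketch, assuming the same pair of intermediary output files per snapshot:

    def check_snapshot_df(self, snapdf, basename):
        # The snapshotted dataframe holds the same ten entries
        assert snapdf.Count().GetValue() == 10

        # Hypothetical intermediary files, named after the entry ranges as in Example #13
        tmp_files = ["{}_0_4.root".format(basename), "{}_5_9.root".format(basename)]
        for filename in tmp_files:
            assert os.path.exists(filename)
            os.remove(filename)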
Example #16
    def test_user_supplied_npartitions_have_precedence(self, connection):
        """
        The SparkContext of this class has 2 cores available. The
        `SparkBackend.optimize_npartitions` method would return 2.
        Check that if the user specifies a number of partitions, this
        is not overwritten by the backend.
        """

        df = Spark.RDataFrame(100, sparkcontext=connection, npartitions=4)

        # The number of partitions was supplied by the user.
        assert df._headnode.npartitions == 4
Example #17
    def test_histo1d_merge(self):
        """Check the working of Histo1D merge operation in the reducer."""
        # Operations with DistRDF
        rdf_py = Spark.RDataFrame(10)
        histo_py = rdf_py.Histo1D("rdfentry_")

        # Operations with PyROOT
        rdf_cpp = ROOT.ROOT.RDataFrame(10)
        histo_cpp = rdf_cpp.Histo1D("rdfentry_")

        # Compare the 2 histograms
        self.assertHistoOrProfile(histo_py, histo_cpp)
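`assertHistoOrProfile` is shared by the merge tests in this suite. A minimal sketch, assuming the comparison is done on entries, mean and standard deviation:

    def assertHistoOrProfile(self, obj_1, obj_2):
        # Both results must be filled and statistically identical
        self.assertEqual(obj_1.GetEntries(), obj_2.GetEntries())
        self.assertEqual(obj_1.GetMean(), obj_2.GetMean())
        self.assertEqual(obj_1.GetStdDev(), obj_2.GetStdDev())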
Example #18
    def test_varyfiltersum(self, connection):
        df = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2).Define("x", "1")
        df_sum = df.Vary("x", "ROOT::RVecI{-1*x, 2*x}", ("down", "up"), "myvariation").Filter("x > 0").Sum("x")

        assert df_sum.GetValue() == 10

        sums = DistRDF.VariationsFor(df_sum)

        expectednames = ["nominal", "myvariation:down", "myvariation:up"]
        expectedsums = [10, 0, 20]
        for varname, val in zip(expectednames, expectedsums):
            assert sums[varname] == val
Example #19
    def test_histo(self, connection):
        df = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2).Define("x", "1")
        df1 = df.Vary("x", "ROOT::RVecI{-2,2}", ["down", "up"])
        h = df1.Histo1D("x")
        histos = DistRDF.VariationsFor(h)

        expectednames = ["nominal", "x:up", "x:down"]
        expectedmeans = [1, 2, -2]
        for varname, mean in zip(expectednames, expectedmeans):
            histo = histos[varname]
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetEntries() == 10
            assert histo.GetMean() == mean
Example #20
    def test_mixed(self, connection):
        df = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2).Define("x", "1").Define("y", "42")
        h = df.Vary("x", "ROOT::RVecI{-1, 2}", variationTags=["down", "up"]).Histo1D("x", "y")
        histos = DistRDF.VariationsFor(h)

        expectednames = ["nominal", "x:down", "x:up"]
        expectedmeans = [1, -1, 2]
        expectedmax = 420
        for varname, mean in zip(expectednames, expectedmeans):
            histo = histos[varname]
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetMaximum() == expectedmax
            assert histo.GetMean() == mean
Example #21
    def test_graph(self, connection):
        df = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2).Define("x", "1")
        g = df.Vary("x", "ROOT::RVecI{-1, 2}", nVariations=2).Graph("x", "x")
        gs = DistRDF.VariationsFor(g)

        assert g.GetMean() == 1

        expectednames = ["nominal", "x:0", "x:1"]
        expectedmeans = [1, -1, 2]
        for varname, mean in zip(expectednames, expectedmeans):
            graph = gs[varname]
            assert isinstance(graph, ROOT.TGraph)
            assert graph.GetMean() == mean
Example #22
    def test_histo_from_empty_root_file(self):
        """
        Check that when performing operations with the distributed backend on
        an RDataFrame without entries, DistRDF raises an error.
        """

        # Create an RDataFrame from a file with an empty tree
        rdf = Spark.RDataFrame("NOMINAL", "emptytree.root")
        histo = rdf.Histo1D(("empty", "empty", 10, 0, 10), "mybranch")

        # Get entries in the histogram, raises error
        with self.assertRaises(RuntimeError):
            histo.GetEntries()
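The input file `emptytree.root` is produced by a fixture not shown here. A minimal sketch that writes a tree "NOMINAL" with a branch "mybranch" and zero entries (the branch type is an assumption):

    @classmethod
    def setUpClass(cls):
        # Hypothetical fixture: an empty tree that still defines the branch layout
        outfile = ROOT.TFile("emptytree.root", "recreate")
        tree = ROOT.TTree("NOMINAL", "NOMINAL")
        holder = numpy.array([0.], dtype="float64")
        tree.Branch("mybranch", holder, "mybranch/D")
        tree.Write()
        outfile.Close()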
Example #23
    def test_user_supplied_npartitions_have_precedence(self):
        """
        Check that even if the Spark configuration options could be used to
        optimize the number of partitions, a user-supplied value for
        npartitions takes precedence.
        """

        conf = {"spark.executor.cores": 4, "spark.executor.instances": 4}
        sconf = pyspark.SparkConf().setAll(conf.items())
        scontext = pyspark.SparkContext(conf=sconf)

        df = Spark.RDataFrame(100, sparkcontext=scontext, npartitions=4)

        # The number of partitions was supplied by the user.
        self.assertEqual(df._headnode.npartitions, 4)
Example #24
    def test_count_with_same_tree_repeated(self, connection):
        """
        Count entries of a dataset with three times the same tree.
        """
        df = ROOT.RDataFrame(100).Define("x", "1")
        treename = "tree"
        filename = "distrdf_roottest_spark_check_backend_same_tree.root"
        filenames = [filename] * 3
        df.Snapshot(treename, filename, ["x"])

        rdf = Spark.RDataFrame(treename, filenames, sparkcontext=connection)
        assert rdf.Count().GetValue() == 300

        os.remove(filename)
Example #25
    def test_profile1d_merge(self):
        """Check the working of Profile1D merge operation in the reducer."""
        # Operations with DistRDF
        rdf_py = Spark.RDataFrame(10)
        columns_py = self.define_two_columns(rdf_py)
        profile_py = columns_py.Profile1D(("", "", 64, -4, 4), "x", "y")

        # Operations with PyROOT
        rdf_cpp = ROOT.ROOT.RDataFrame(10)
        columns_cpp = self.define_two_columns(rdf_cpp)
        profile_cpp = columns_cpp.Profile1D(("", "", 64, -4, 4), "x", "y")

        # Compare the 2 profiles
        self.assertHistoOrProfile(profile_py, profile_cpp)
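`define_two_columns` is shared between the DistRDF and plain RDataFrame branches of the merge tests. A minimal sketch; the exact expressions for "x" and "y" are an assumption, and any deterministic definitions yield identical results on both sides:

    def define_two_columns(self, rdf):
        # Hypothetical deterministic definitions for "x" and "y"
        return rdf.Define("x", "rdfentry_").Define("y", "rdfentry_ * rdfentry_")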
Example #26
    def test_redefine_one_column(self, connection):
        """Test that values of one column can be properly redefined."""
        # A simple dataframe with ten sequential numbers from 0 to 9
        df = Spark.RDataFrame(10, sparkcontext=connection)
        df_before = df.Define("x", "1")
        df_after = df_before.Redefine("x", "2")

        # Initial sum should be equal to 10
        sum_before = df_before.Sum("x")
        # Sum after the redefinition should be equal to 20
        sum_after = df_after.Sum("x")

        assert sum_before.GetValue() == 10.0
        assert sum_after.GetValue() == 20.0
Example #27
    def test_friends_tchain_noname_add_fullpath_addfriend_alias(self):
        """Test against the reproducer of issue https://github.com/root-project/root/issues/7584"""

        rn1 = "rn1.root"
        rn2 = "rn2.root"
        friendsfilename = "friendtrees_spark.root"

        df_1 = ROOT.RDataFrame(10000)
        df_2 = ROOT.RDataFrame(10000)

        df_1 = df_1.Define("rnd", "gRandom->Gaus(10)")
        df_2 = df_2.Define("rnd", "gRandom->Gaus(20)")

        df_1.Snapshot("randomNumbers", rn1)
        df_2.Snapshot("randomNumbersBis", rn2)

        # Put the two trees together in a common file
        subprocess.run("hadd -f {} {} {}".format(friendsfilename, rn1, rn2),
                       shell=True,
                       check=True)

        # Test the specific case of a parent chain and friend chain with no
        # names, that receive one tree each in the form "filename/treename". The
        # friend is then added to the parent with an alias.
        chain = ROOT.TChain()
        chainFriend = ROOT.TChain()

        chain.Add("friendtrees_spark.root/randomNumbers")
        chainFriend.Add("friendtrees_spark.root/randomNumbersBis")

        chain.AddFriend(chainFriend, "myfriend")

        df = Spark.RDataFrame(chain)

        h_parent = df.Histo1D("rnd")
        h_friend = df.Histo1D("myfriend.rnd")

        self.assertEqual(h_parent.GetEntries(), 10000)
        self.assertEqual(h_friend.GetEntries(), 10000)

        self.assertAlmostEqual(h_parent.GetMean(), 10, delta=0.01)
        self.assertAlmostEqual(h_friend.GetMean(), 20, delta=0.01)

        self.assertAlmostEqual(h_parent.GetStdDev(), 1, delta=0.01)
        self.assertAlmostEqual(h_friend.GetStdDev(), 1, delta=0.01)

        os.remove(rn1)
        os.remove(rn2)
        os.remove(friendsfilename)
Example #28
    def test_rungraphs_sparkanddask_3histos(self, connection):
        """
        Submit three different RDF graphs concurrently to Spark and Dask
        """
        daskconn, sparkconn = connection
        # Create a test file for processing
        treename = "myTree"
        filename = "2clusters.root"
        nentries = 10000
        opts = ROOT.RDF.RSnapshotOptions()
        opts.fAutoFlush = 5000
        ROOT.RDataFrame(nentries).Define("b1", "42")\
                                 .Define("b2", "42")\
                                 .Define("b3", "42")\
                                 .Snapshot(treename, filename, ["b1", "b2", "b3"], opts)

        histoproxies_spark = [
            Spark.RDataFrame(treename,
                             filename,
                             sparkcontext=sparkconn,
                             npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in ["b1", "b2", "b3"]
        ]

        histoproxies_dask = [
            Dask.RDataFrame(treename,
                            filename,
                            daskclient=daskconn,
                            npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in ["b1", "b2", "b3"]
        ]

        histoproxies = histoproxies_spark + histoproxies_dask

        # Before triggering the computation graphs, the proxy values are None
        for proxy in histoproxies:
            assert proxy.proxied_node.value is None

        DistRDF.RunGraphs(histoproxies)

        # After RunGraphs all histograms are correctly assigned to the
        # node objects
        for proxy in histoproxies:
            histo = proxy.proxied_node.value
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetEntries() == nentries
            assert histo.GetMean() == 42

        os.remove(filename)
Example #29
    def test_histo3d_merge(self):
        """Check the working of Histo3D merge operation in the reducer."""
        modelTH3D = ("", "", 64, -4, 4, 64, -4, 4, 64, -4, 4)
        # Operations with DistRDF
        rdf_py = Spark.RDataFrame(10)
        columns_py = self.define_three_columns(rdf_py)
        histo_py = columns_py.Histo3D(modelTH3D, "x", "y", "z")

        # Operations with PyROOT
        rdf_cpp = ROOT.ROOT.RDataFrame(10)
        columns_cpp = self.define_three_columns(rdf_cpp)
        histo_cpp = columns_cpp.Histo3D(modelTH3D, "x", "y", "z")

        # Compare the 2 histograms
        self.assertHistoOrProfile(histo_py, histo_cpp)
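`define_three_columns` is the three-column counterpart of `define_two_columns` (see the sketch after Example #25), again with assumed expressions:

    def define_three_columns(self, rdf):
        # Hypothetical deterministic definitions for "x", "y" and "z"
        return rdf.Define("x", "rdfentry_")\
                  .Define("y", "rdfentry_ * rdfentry_")\
                  .Define("z", "rdfentry_ * rdfentry_ * rdfentry_")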
Example #30
    def test_distributed_asnumpy_lazy(self, connection):
        """Test that `AsNumpy` can be still called lazily in distributed mode"""

        # Let's create a simple dataframe with ten rows and two columns
        df = Spark.RDataFrame(10, sparkcontext=connection).Define("x", "(int)rdfentry_")\
            .Define("y", "1.f/(1.f+rdfentry_)")

        npy_lazy = df.AsNumpy(lazy=True)
        # The event loop hasn't been triggered yet
        assert isinstance(npy_lazy, ActionProxy)
        assert npy_lazy.proxied_node.value is None

        # Trigger the computations and check final results
        npy = npy_lazy.GetValue()
        self.check_npy_dict(npy)