def test_tchain_with_friend_tchain_histo(self, connection):
    """
    Tests that the computational graph can be issued both on the parent
    chain and the friend chain.
    """
    main_filename = "main_chain.root"
    friend_filename = "friend_chain.root"

    self.create_parent_tree(main_filename)
    self.create_friend_tree(friend_filename)

    # Main TChain
    mainchain = ROOT.TChain("T")
    mainchain.Add(main_filename)

    # Friend TChain
    friendchain = ROOT.TChain("TF")
    friendchain.Add(friend_filename)

    # Add friend chain to the main one
    mainchain.AddFriend(friendchain)

    # Create a DistRDF RDataFrame with the main and the friend chains
    df = Dask.RDataFrame(mainchain, daskclient=connection)

    # Create histograms
    h_parent = df.Histo1D("x")
    h_friend = df.Histo1D("TF.x")

    check_histograms(h_parent, h_friend)

    # Remove unnecessary .root files
    os.remove(main_filename)
    os.remove(friend_filename)

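# NOTE: `check_histograms` is a helper defined elsewhere in this test module.
# A minimal sketch of the kind of checks it is assumed to perform on the two
# histogram proxies; the `_sketch` name and the exact assertions are
# illustrative, not the actual implementation.
def check_histograms_sketch(h_parent, h_friend):
    # Accessing the proxies triggers the distributed event loop
    assert h_parent.GetEntries() > 0
    assert h_friend.GetEntries() == h_parent.GetEntries()
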
def test_write_histo(self, connection):
    """
    Tests that a histogram is correctly written to a .root file created
    before the execution of the event loop.
    """
    self.create_tree_with_data()

    # Create a new file where the histogram will be written
    outfile = ROOT.TFile("out_file.root", "recreate")

    # Create a DistRDF RDataFrame reading the tree written by the helper
    df = Dask.RDataFrame("Events", "tree_gaus.root", daskclient=connection)

    # Create histogram
    histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

    # Write histogram to out_file.root and close the file
    histo.Write()
    outfile.Close()

    # Reopen file to check that the histogram was correctly stored
    reopen_file = ROOT.TFile("out_file.root", "read")
    reopen_histo = reopen_file.Get("x")

    # Check histogram statistics
    assert reopen_histo.GetEntries() == self.nentries
    assert reopen_histo.GetMean() == pytest.approx(self.gaus_mean, self.delta_equal)
    assert reopen_histo.GetStdDev() == pytest.approx(self.gaus_stdev, self.delta_equal)

    # Remove unnecessary .root files
    os.remove("tree_gaus.root")
    os.remove("out_file.root")

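# NOTE: `create_tree_with_data` is a helper of this test class. A sketch of
# the setup it is assumed to perform, based on the attributes used above
# (self.nentries, self.gaus_mean, self.gaus_stdev); the `_sketch` name and
# the exact expression are illustrative.
def create_tree_with_data_sketch(self):
    # Write a tree "Events" with a gaussian branch "x" to the file that the
    # test reads back through the distributed RDataFrame
    ROOT.RDataFrame(self.nentries)\
        .Define("x", "gRandom->Gaus({}, {})".format(self.gaus_mean, self.gaus_stdev))\
        .Snapshot("Events", "tree_gaus.root", ["x"])
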
def build_distrdf_graph(self, connection):
    """
    Create a DistRDF graph with a fixed set of operations and return it.
    """
    treename = "data"
    files = ["http://root.cern/files/teaching/CMS_Open_Dataset.root", ]
    rdf = Dask.RDataFrame(treename, files, npartitions=5, daskclient=connection)

    # Define the analysis cuts
    chargeCutStr = "C1 != C2"
    etaCutStr = "fabs(eta1) < 2.3 && fabs(eta2) < 2.3"
    ptCutStr = "pt1 > 2 && pt2 > 2"
    rdf_f = rdf.Filter(chargeCutStr, "Opposite Charge") \
               .Filter(etaCutStr, "Central Muons") \
               .Filter(ptCutStr, "Sane Pt")

    # Create the invariant mass column
    invMassFormulaStr = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                         "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
    rdf_fd = rdf_f.Define("invMass", invMassFormulaStr)

    # Create the histograms
    pt1_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt1")
    pt2_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt2")
    model = ("invMass", "CMS Opendata;#mu#mu mass[GeV];Events", 512, 5, 110)
    invMass_h = rdf_fd.Histo1D(model, "invMass")
    pi = ROOT.TMath.Pi()
    model = ("", "", 64, -pi, pi, 64, -pi, pi)
    phis_h = rdf_fd.Histo2D(model, "phi1", "phi2")

    return pt1_h, pt2_h, invMass_h, phis_h

def _extend_ROOT_include_path(self, connection):
    """
    Check that the include path of ROOT is extended with the directories
    specified in `DistRDF.include_headers()` so references between headers
    are correctly resolved.
    """
    # Create an RDataFrame with 100 integers from 0 to 99
    rdf = Dask.RDataFrame(100, daskclient=connection)

    # Distribute headers to the workers
    header_folder = "../test_headers/headers_folder"
    rdf._headnode.backend.distribute_headers(header_folder)

    # Get list of include paths seen by ROOT
    ROOT_include_path = ROOT.gInterpreter.GetIncludePath().split(" ")

    # Create new include folder token
    new_folder_include = "-I\"{}\"".format(header_folder)

    # Check that the new folder is in the ROOT include paths
    assert new_folder_include in ROOT_include_path

    # Filter numbers less than 10 and create a histogram
    rdf_less_than_10 = rdf.Filter("check_number_less_than_10(tdfentry_)")
    histo1 = rdf_less_than_10.Histo1D("tdfentry_")

    # Check that the histogram has 10 entries and a mean of 4.5
    assert histo1.GetEntries() == 10
    assert histo1.GetMean() == pytest.approx(4.5)

def test_definepersample_simple(self, connection):
    """
    Test DefinePerSample operation on three samples using a predefined
    string of operations.
    """
    df = Dask.RDataFrame(self.maintreename, self.filenames, daskclient=connection)

    # Associate a number to each sample
    definepersample_code = """
    if(rdfsampleinfo_.Contains(\"{}\")) return 1;
    else if (rdfsampleinfo_.Contains(\"{}\")) return 2;
    else if (rdfsampleinfo_.Contains(\"{}\")) return 3;
    else return 0;
    """.format(*self.samples)

    df1 = df.DefinePerSample("sampleid", definepersample_code)

    # Filter by the sample number. Each filtered dataframe should contain
    # 10 entries, equal to the number of entries per sample
    samplescounts = [
        df1.Filter("sampleid == {}".format(id)).Count() for id in [1, 2, 3]
    ]

    for count in samplescounts:
        assert count.GetValue() == 10

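# NOTE: `self.maintreename`, `self.filenames` and `self.samples` are prepared
# by setup code elsewhere in this test class. A sketch of the layout assumed
# by the DefinePerSample tests (three files, 10 entries each, as asserted
# above); the `_sketch` name and the exact values are illustrative.
def setup_definepersample_samples_sketch(self):
    self.maintreename = "Events"
    self.samples = ["sample1", "sample2", "sample3"]
    self.filenames = ["{}.root".format(sample) for sample in self.samples]
    for filename in self.filenames:
        ROOT.RDataFrame(10).Define("x", "rdfentry_")\
                           .Snapshot(self.maintreename, filename, ["x"])
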
def _includes_function_with_filter_and_histo(self, connection):
    """
    Check that the filter operation is able to use C++ functions that
    were included using header files.
    """
    rdf = Dask.RDataFrame(10, daskclient=connection)
    rdf._headnode.backend.distribute_headers("../test_headers/header1.hxx")

    # This filters out all numbers less than 5
    rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
    histo = rdf_filtered.Histo1D("tdfentry_")

    # The expected results after filtering: the set of numbers that should
    # survive the cut and their statistics
    required_numbers = range(5)
    required_size = len(required_numbers)
    required_mean = sum(required_numbers) / float(required_size)
    required_stdDev = math.sqrt(
        sum((x - required_mean)**2 for x in required_numbers) / required_size)

    # Compare the sizes of the equivalent sets of numbers
    assert histo.GetEntries() == required_size

    # Compare the means of the equivalent sets of numbers
    assert histo.GetMean() == required_mean

    # Compare the standard deviations of the equivalent sets of numbers
    assert histo.GetStdDev() == required_stdDev

def test_distributed_snapshot_columnlist(self, connection):
    """
    Test that distributed Snapshot correctly forwards its third input
    argument "columnList".
    """
    # A simple dataframe with ten sequential numbers from 0 to 9
    df = Dask.RDataFrame(10, daskclient=connection)\
             .Define("a", "rdfentry_")\
             .Define("b", "rdfentry_")\
             .Define("c", "rdfentry_")\
             .Define("d", "rdfentry_")

    expectedcolumns = ["a", "b"]
    df.Snapshot("snapTree_columnlist", "distrdf_dask_snapfile_columnlist.root", expectedcolumns)

    # Create a traditional RDF from the snapshotted files to retrieve the
    # list of columns
    tmp_files = ["distrdf_dask_snapfile_columnlist_0.root",
                 "distrdf_dask_snapfile_columnlist_1.root"]
    rdf = ROOT.RDataFrame("snapTree_columnlist", tmp_files)
    snapcolumns = [str(column) for column in rdf.GetColumnNames()]

    assert snapcolumns == expectedcolumns

    for filename in tmp_files:
        os.remove(filename)

def test_definepersample_withinitialization(self, connection):
    """
    Test DefinePerSample operation on three samples using C++ functions
    declared to the ROOT interpreter.
    """
    # Write initialization code that will be run in the workers to make the
    # needed functions available
    def declare_definepersample_code():
        ROOT.gInterpreter.Declare('''
        #ifndef distrdf_test_definepersample_withinitialization
        #define distrdf_test_definepersample_withinitialization
        float sample1_weight(){ return 1.0f; }
        float sample2_weight(){ return 2.0f; }
        float sample3_weight(){ return 3.0f; }
        float samples_weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
            if (id.Contains("sample1")){
                return sample1_weight();
            } else if (id.Contains("sample2")){
                return sample2_weight();
            } else if (id.Contains("sample3")){
                return sample3_weight();
            }
            return -999.0f;
        }
        std::string samples_names(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
            return id.AsString();
        }
        #endif // distrdf_test_definepersample_withinitialization
        ''')

    DistRDF.initialize(declare_definepersample_code)

    df = Dask.RDataFrame(self.maintreename, self.filenames, daskclient=connection)
    df1 = df.DefinePerSample("sample_weight", "samples_weights(rdfslot_, rdfsampleinfo_)")\
            .DefinePerSample("sample_name", "samples_names(rdfslot_, rdfsampleinfo_)")

    # Filter by the two defined columns per sample: a weight and the sample
    # string representation. Each filtered dataset should have 10 entries,
    # equal to the number of entries per sample
    weightsandnames = [("1.0f", "sample1.root/Events"),
                       ("2.0f", "sample2.root/Events"),
                       ("3.0f", "sample3.root/Events")]
    samplescounts = [
        df1.Filter("sample_weight == {} && sample_name == \"{}\"".format(weight, name)).Count()
        for (weight, name) in weightsandnames
    ]

    for count in samplescounts:
        assert count.GetValue() == 10

def test_initialization_method(self, connection):
    """
    Check `DistRDF.initialize` with the Dask backend. Defines an integer
    value in the ROOT interpreter and checks that this value is available
    in the worker processes.
    """
    def init(value):
        import ROOT
        cpp_code = f"int userValue = {value};"
        ROOT.gInterpreter.ProcessLine(cpp_code)

    DistRDF.initialize(init, 123)

    # The Dask backend has a limited list of supported methods, so we use
    # Histo1D, which is a supported action.
    # The code below creates an RDataFrame instance with one single entry
    # and defines a column 'u' whose value is taken from the variable
    # 'userValue'. This variable is only declared inside the ROOT
    # interpreter, but its value is passed by the user from the Python side.
    # If the init function defined by the user is properly propagated to the
    # Dask backend, each worker will run it as a first step, hence the
    # variable 'userValue' will be defined at runtime. As a result, the
    # Define operation should read the variable 'userValue' and assign it to
    # the entries of the column 'u' (only one entry). Finally, Histo1D
    # returns a histogram filled with that one value, whose mean has to be
    # the value itself, independently of the number of spawned workers.
    df = Dask.RDataFrame(1, daskclient=connection).Define(
        "u", "userValue").Histo1D("u")
    h = df.GetValue()
    assert h.GetMean() == 123

def test_distributed_sum(self, connection):
    """Test support for `Sum` operation in distributed backend"""
    rdf_py = Dask.RDataFrame(10, daskclient=connection)
    rdf_def = rdf_py.Define("x", "rdfentry_")
    rdf_sum = rdf_def.Sum("x")

    assert rdf_sum.GetValue() == 45.0

def test_distributed_snapshot(self, connection):
    """Test support for `Snapshot` in distributed backend"""
    # A simple dataframe with ten sequential numbers from 0 to 9
    df = Dask.RDataFrame(10, daskclient=connection).Define("x", "rdfentry_")

    # Snapshot to two files, build a ROOT.TChain with them and retrieve a
    # Dask.RDataFrame
    snapdf = df.Snapshot("snapTree", "snapFile.root")
    self.check_snapshot_df(snapdf, "snapFile")

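# NOTE: `check_snapshot_df` is a helper of this test class. A sketch of the
# checks it is assumed to perform on the dataframe returned by a distributed
# Snapshot, which writes one partial file per partition; the `_sketch` name,
# the glob pattern and the assertions are illustrative.
def check_snapshot_df_sketch(self, snapdf, filebasename):
    import glob
    # The return value of Snapshot is again a distributed RDataFrame, so
    # further operations can be booked on it
    assert snapdf.Count().GetValue() == 10
    # One output file per partition is expected on disk
    partial_files = glob.glob("{}_*.root".format(filebasename))
    assert partial_files
    for partial_file in partial_files:
        os.remove(partial_file)
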
def test_distributed_asnumpy(self, connection):
    """Test support for `AsNumpy` pythonization in distributed backend"""
    # Let's create a simple dataframe with ten rows and two columns
    df = Dask.RDataFrame(10, daskclient=connection)\
             .Define("x", "(int)rdfentry_")\
             .Define("y", "1.f/(1.f+rdfentry_)")

    # Build a dictionary of numpy arrays.
    npy = df.AsNumpy()
    self.check_npy_dict(npy)

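# NOTE: `check_npy_dict` is a helper of this test class. A sketch of the
# checks it is assumed to perform on the dictionary returned by AsNumpy; the
# `_sketch` name and the exact assertions are illustrative.
def check_npy_dict_sketch(self, npy):
    import numpy
    # AsNumpy returns a dictionary mapping column names to numpy arrays
    assert set(npy.keys()) == {"x", "y"}
    assert all(isinstance(arr, numpy.ndarray) for arr in npy.values())
    assert len(npy["x"]) == 10
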
def test_user_supplied_npartitions_have_precedence(self, connection):
    """
    The Dask Client object is connected to a LocalCluster with 2 processes,
    so the `DaskBackend.optimize_npartitions` method would return 2. Check
    that a number of partitions specified by the user is not overwritten by
    the backend.
    """
    df = Dask.RDataFrame(100, daskclient=connection, npartitions=4)

    # The number of partitions was supplied by the user.
    assert df._headnode.npartitions == 4

def test_histo1d_merge(self, connection):
    """Check the working of Histo1D merge operation in the reducer."""
    # Operations with DistRDF
    rdf_py = Dask.RDataFrame(10, daskclient=connection)
    histo_py = rdf_py.Histo1D("rdfentry_")

    # Operations with PyROOT
    rdf_cpp = ROOT.ROOT.RDataFrame(10)
    histo_cpp = rdf_cpp.Histo1D("rdfentry_")

    # Compare the 2 histograms
    self.assertHistoOrProfile(histo_py, histo_cpp)

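# NOTE: `assertHistoOrProfile` is a helper of this test class. A sketch of the
# comparison it is assumed to perform between the distributed result and the
# local PyROOT result; the `_sketch` name and the exact assertions are
# illustrative.
def assertHistoOrProfile_sketch(self, obj_py, obj_cpp):
    # Merging the partial results produced on the workers must reproduce the
    # statistics of the single-process RDataFrame run
    assert obj_py.GetEntries() == obj_cpp.GetEntries()
    assert obj_py.GetMean() == obj_cpp.GetMean()
    assert obj_py.GetStdDev() == obj_cpp.GetStdDev()
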
def test_redefine_one_column(self, connection):
    """Test that values of one column can be properly redefined."""
    # A simple dataframe with ten sequential numbers from 0 to 9
    df = Dask.RDataFrame(10, daskclient=connection)
    df_before = df.Define("x", "1")
    df_after = df_before.Redefine("x", "2")

    # Initial sum should be equal to 10
    sum_before = df_before.Sum("x")
    # Sum after the redefinition should be equal to 20
    sum_after = df_after.Sum("x")

    assert sum_before.GetValue() == 10.0
    assert sum_after.GetValue() == 20.0

def test_distributed_snapshot_lazy(self, connection):
    """Test that `Snapshot` can still be called lazily in distributed mode"""
    # A simple dataframe with ten sequential numbers from 0 to 9
    df = Dask.RDataFrame(10, daskclient=connection).Define("x", "rdfentry_")

    opts = ROOT.RDF.RSnapshotOptions()
    opts.fLazy = True
    snap_lazy = df.Snapshot("snapTree_lazy", "snapFile_lazy.root", ["x"], opts)
    # The event loop hasn't been triggered yet
    assert isinstance(snap_lazy, ActionProxy)
    assert snap_lazy.proxied_node.value is None

    snapdf = snap_lazy.GetValue()
    self.check_snapshot_df(snapdf, "snapFile_lazy")

def test_count_with_same_tree_repeated(self, connection):
    """
    Count the entries of a dataset that contains the same tree three times.
    """
    df = ROOT.RDataFrame(100).Define("x", "1")
    treename = "tree"
    filename = "distrdf_roottest_dask_check_backend_same_tree.root"
    filenames = [filename] * 3
    df.Snapshot(treename, filename, ["x"])

    rdf = Dask.RDataFrame(treename, filenames, daskclient=connection)
    assert rdf.Count().GetValue() == 300

    os.remove(filename)

def test_profile1d_merge(self, connection):
    """Check the working of Profile1D merge operation in the reducer."""
    # Operations with DistRDF
    rdf_py = Dask.RDataFrame(10, daskclient=connection)
    columns_py = self.define_two_columns(rdf_py)
    profile_py = columns_py.Profile1D(("", "", 64, -4, 4), "x", "y")

    # Operations with PyROOT
    rdf_cpp = ROOT.ROOT.RDataFrame(10)
    columns_cpp = self.define_two_columns(rdf_cpp)
    profile_cpp = columns_cpp.Profile1D(("", "", 64, -4, 4), "x", "y")

    # Compare the 2 profiles
    self.assertHistoOrProfile(profile_py, profile_cpp)

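# NOTE: `define_two_columns` is a helper of this test class (with analogous
# `define_three_columns` and `define_four_columns` helpers used by the other
# merge tests). A sketch of the kind of column definitions it is assumed to
# add; the `_sketch` name and the expressions are illustrative.
def define_two_columns_sketch(self, rdf):
    # Columns derived from the implicit entry number work identically on the
    # distributed and on the local RDataFrame
    return rdf.Define("x", "rdfentry_").Define("y", "rdfentry_ * rdfentry_")
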
def test_varyfiltersum(self, connection):
    """Test a Vary operation followed by Filter and Sum in distributed mode."""
    df = Dask.RDataFrame(10, daskclient=connection, npartitions=2).Define("x", "1")
    df_sum = df.Vary("x", "ROOT::RVecI{-1*x, 2*x}",
                     ("down", "up"), "myvariation").Filter("x > 0").Sum("x")
    assert df_sum.GetValue() == 10

    sums = DistRDF.VariationsFor(df_sum)

    expectednames = ["nominal", "myvariation:down", "myvariation:up"]
    expectedsums = [10, 0, 20]
    for varname, val in zip(expectednames, expectedsums):
        assert sums[varname] == val

def test_graph(self, connection):
    """Test varied results of a Graph action in distributed mode."""
    df = Dask.RDataFrame(10, daskclient=connection, npartitions=2).Define("x", "1")
    g = df.Vary("x", "ROOT::RVecI{-1, 2}", nVariations=2).Graph("x", "x")
    gs = DistRDF.VariationsFor(g)

    assert g.GetMean() == 1

    expectednames = ["nominal", "x:0", "x:1"]
    expectedmeans = [1, -1, 2]
    for varname, mean in zip(expectednames, expectedmeans):
        graph = gs[varname]
        assert isinstance(graph, ROOT.TGraph)
        assert graph.GetMean() == mean

def test_histo(self, connection):
    """Test varied results of a Histo1D action in distributed mode."""
    df = Dask.RDataFrame(10, daskclient=connection, npartitions=2).Define("x", "1")
    df1 = df.Vary("x", "ROOT::RVecI{-2,2}", ["down", "up"])
    h = df1.Histo1D("x")
    histos = DistRDF.VariationsFor(h)

    expectednames = ["nominal", "x:up", "x:down"]
    expectedmeans = [1, 2, -2]
    for varname, mean in zip(expectednames, expectedmeans):
        histo = histos[varname]
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetEntries() == 10
        assert histo.GetMean() == mean

def test_rungraphs_sparkanddask_3histos(self, connection):
    """
    Submit three different RDF graphs concurrently to Spark and Dask
    """
    daskconn, sparkconn = connection
    # Create a test file for processing
    treename = "myTree"
    filename = "2clusters.root"
    nentries = 10000
    opts = ROOT.RDF.RSnapshotOptions()
    opts.fAutoFlush = 5000
    ROOT.RDataFrame(nentries).Define("b1", "42")\
                             .Define("b2", "42")\
                             .Define("b3", "42")\
                             .Snapshot(treename, filename, ["b1", "b2", "b3"], opts)

    histoproxies_spark = [
        Spark.RDataFrame(treename, filename, sparkcontext=sparkconn, npartitions=2)
             .Histo1D((col, col, 1, 40, 45), col)
        for col in ["b1", "b2", "b3"]
    ]
    histoproxies_dask = [
        Dask.RDataFrame(treename, filename, daskclient=daskconn, npartitions=2)
            .Histo1D((col, col, 1, 40, 45), col)
        for col in ["b1", "b2", "b3"]
    ]
    histoproxies = histoproxies_spark + histoproxies_dask

    # Before triggering the computation graphs, the proxy values are None
    for proxy in histoproxies:
        assert proxy.proxied_node.value is None

    DistRDF.RunGraphs(histoproxies)

    # After RunGraphs all histograms are correctly assigned to the
    # node objects
    for proxy in histoproxies:
        histo = proxy.proxied_node.value
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetEntries() == nentries
        assert histo.GetMean() == 42

    os.remove(filename)

def test_mixed(self, connection):
    """Test a Vary operation on a weighted Histo1D action in distributed mode."""
    df = Dask.RDataFrame(10, daskclient=connection, npartitions=2)\
             .Define("x", "1").Define("y", "42")
    h = df.Vary("x", "ROOT::RVecI{-1, 2}", variationTags=["down", "up"]).Histo1D("x", "y")
    histos = DistRDF.VariationsFor(h)

    expectednames = ["nominal", "x:down", "x:up"]
    expectedmeans = [1, -1, 2]
    expectedmax = 420
    for varname, mean in zip(expectednames, expectedmeans):
        histo = histos[varname]
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetMaximum() == expectedmax
        assert histo.GetMean() == mean

def test_histo_from_empty_root_file(self, connection):
    """
    Check that when performing operations with the distributed backend on
    an RDataFrame without entries, DistRDF raises an error.
    """
    # Create an RDataFrame from a file with an empty tree
    rdf = Dask.RDataFrame("NOMINAL", "../emptytree.root", daskclient=connection)
    histo = rdf.Histo1D(("empty", "empty", 10, 0, 10), "mybranch")

    # Get entries in the histogram, raises error
    with pytest.raises(RuntimeError):
        histo.GetEntries()

def test_distributed_asnumpy_lazy(self, connection):
    """Test that `AsNumpy` can still be called lazily in distributed mode"""
    # Let's create a simple dataframe with ten rows and two columns
    df = Dask.RDataFrame(10, daskclient=connection)\
             .Define("x", "(int)rdfentry_")\
             .Define("y", "1.f/(1.f+rdfentry_)")

    npy_lazy = df.AsNumpy(lazy=True)
    # The event loop hasn't been triggered yet
    assert isinstance(npy_lazy, ActionProxy)
    assert npy_lazy.proxied_node.value is None

    # Trigger the computations and check final results
    npy = npy_lazy.GetValue()
    self.check_npy_dict(npy)

def test_initialization(self, connection):
    """
    Check that the user initialization method is assigned to the current
    backend.
    """
    def returnNumber(n):
        return n

    DistRDF.initialize(returnNumber, 123)

    # Dummy df just to retrieve the initialization function
    df = Dask.RDataFrame(10, daskclient=connection)
    f = df._headnode.backend.initialization

    assert f() == 123

def test_histo3d_merge(self, connection):
    """Check the working of Histo3D merge operation in the reducer."""
    modelTH3D = ("", "", 64, -4, 4, 64, -4, 4, 64, -4, 4)
    # Operations with DistRDF
    rdf_py = Dask.RDataFrame(10, daskclient=connection)
    columns_py = self.define_three_columns(rdf_py)
    histo_py = columns_py.Histo3D(modelTH3D, "x", "y", "z")

    # Operations with PyROOT
    rdf_cpp = ROOT.ROOT.RDataFrame(10)
    columns_cpp = self.define_three_columns(rdf_cpp)
    histo_cpp = columns_cpp.Histo3D(modelTH3D, "x", "y", "z")

    # Compare the 2 histograms
    self.assertHistoOrProfile(histo_py, histo_cpp)

def test_simultaneous(self, connection):
    """Test a simultaneous Vary of two columns in distributed mode."""
    df = Dask.RDataFrame(10, daskclient=connection, npartitions=2)\
             .Define("x", "1").Define("y", "42")
    h = df.Vary(["x", "y"],
                "ROOT::RVec<ROOT::RVecI>{{-1, 2, 3}, {41, 43, 44}}",
                ["down", "up", "other"], "xy").Histo1D("x", "y")
    histos = DistRDF.VariationsFor(h)

    expectednames = ["nominal", "xy:down", "xy:up", "xy:other"]
    expectedmeans = [1, -1, 2, 3]
    expectedmax = [420, 410, 430, 440]
    for varname, mean, maxval in zip(expectednames, expectedmeans, expectedmax):
        histo = histos[varname]
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetMaximum() == maxval
        assert histo.GetMean() == mean

def test_histond_merge(self, connection):
    """Check the working of HistoND merge operation in the reducer."""
    nbins = (10, 10, 10, 10)
    xmin = (0., 0., 0., 0.)
    xmax = (100., 100., 100., 100.)
    modelTHND = ("name", "title", 4, nbins, xmin, xmax)
    colnames = ("x0", "x1", "x2", "x3")

    distrdf = Dask.RDataFrame(100, daskclient=connection)
    rdf = ROOT.RDataFrame(100)

    distrdf_withcols = self.define_four_columns(distrdf, colnames)
    rdf_withcols = self.define_four_columns(rdf, colnames)

    histond_distrdf = distrdf_withcols.HistoND(modelTHND, colnames)
    histond_rdf = rdf_withcols.HistoND(modelTHND, colnames)

    assert histond_distrdf.GetEntries() == histond_rdf.GetEntries()
    assert histond_distrdf.GetNbins() == histond_rdf.GetNbins()

def test_friends_tchain_noname_add_fullpath_addfriend_alias(self, connection):
    """Test against the reproducer of issue
    https://github.com/root-project/root/issues/7584"""
    rn1 = "rn1.root"
    rn2 = "rn2.root"
    friendsfilename = "friendtrees_dask.root"

    df_1 = ROOT.RDataFrame(10000)
    df_2 = ROOT.RDataFrame(10000)

    df_1 = df_1.Define("rnd", "gRandom->Gaus(10)")
    df_2 = df_2.Define("rnd", "gRandom->Gaus(20)")

    df_1.Snapshot("randomNumbers", rn1)
    df_2.Snapshot("randomNumbersBis", rn2)

    # Put the two trees together in a common file
    subprocess.run("hadd -f {} {} {}".format(friendsfilename, rn1, rn2),
                   shell=True, check=True)

    # Test the specific case of a parent chain and friend chain with no
    # names, that receive one tree each in the form "filename/treename". The
    # friend is then added to the parent with an alias.
    chain = ROOT.TChain()
    chainFriend = ROOT.TChain()

    chain.Add("friendtrees_dask.root/randomNumbers")
    chainFriend.Add("friendtrees_dask.root/randomNumbersBis")

    chain.AddFriend(chainFriend, "myfriend")

    df = Dask.RDataFrame(chain, daskclient=connection)

    h_parent = df.Histo1D("rnd")
    h_friend = df.Histo1D("myfriend.rnd")

    check_histograms(h_parent, h_friend)

    os.remove(rn1)
    os.remove(rn2)
    os.remove(friendsfilename)
