def test_definepersample_withinitialization(self, connection):
    """
    Exercise DefinePerSample over three samples, where the per-sample values
    come from C++ functions that an initialization function declares to the
    ROOT interpreter.
    """

    def declare_definepersample_code():
        # Initialization code registered via DistRDF.initialize below, so the
        # helper functions are available where the graph is executed. The
        # include guard protects against declaring the functions twice.
        ROOT.gInterpreter.Declare('''
        #ifndef distrdf_test_definepersample_withinitialization
        #define distrdf_test_definepersample_withinitialization
        float sample1_weight(){
            return 1.0f;
        }

        float sample2_weight(){
            return 2.0f;
        }

        float sample3_weight(){
            return 3.0f;
        }

        float samples_weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
            if (id.Contains("sample1")){
                return sample1_weight();
            } else if (id.Contains("sample2")){
                return sample2_weight();
            } else if (id.Contains("sample3")){
                return sample3_weight();
            }
            return -999.0f;
        }

        std::string samples_names(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
            return id.AsString();
        }
        #endif // distrdf_test_definepersample_withinitialization
        ''')

    DistRDF.initialize(declare_definepersample_code)

    dataframe = Spark.RDataFrame(self.maintreename, self.filenames,
                                 sparkcontext=connection)
    with_weight = dataframe.DefinePerSample(
        "sample_weight", "samples_weights(rdfslot_, rdfsampleinfo_)")
    with_weight_and_name = with_weight.DefinePerSample(
        "sample_name", "samples_names(rdfslot_, rdfsampleinfo_)")

    # Each (weight, name) pair selects exactly one sample through the two
    # per-sample columns; every sample is expected to hold 10 entries.
    weightsandnames = [
        ("1.0f", "sample1.root/Events"),
        ("2.0f", "sample2.root/Events"),
        ("3.0f", "sample3.root/Events"),
    ]
    selection = "sample_weight == {} && sample_name == \"{}\""

    # Book every Count before reading any value.
    booked_counts = []
    for weight, name in weightsandnames:
        filtered = with_weight_and_name.Filter(selection.format(weight, name))
        booked_counts.append(filtered.Count())

    for booked in booked_counts:
        assert booked.GetValue() == 10
def _extend_ROOT_include_path(self, connection):
    """
    Verify that distributing a folder of headers adds that folder to the
    include path of ROOT, so that headers referring to each other resolve,
    and that a function from those headers is usable in a Filter.
    """
    headers_dir = "../test_headers/headers_folder"

    # Dataframe over 100 entries (0 to 99)
    rdf = Spark.RDataFrame(100, sparkcontext=connection)

    # Ship the headers through the backend of the head node
    rdf._headnode.backend.distribute_headers(headers_dir)

    # The interpreter reports its include path as space-separated tokens;
    # the distributed folder must appear as a quoted -I token.
    include_tokens = ROOT.gInterpreter.GetIncludePath().split(" ")
    expected_token = "-I\"{}\"".format(headers_dir)
    assert expected_token in include_tokens

    # Keep only the entries below 10 and histogram them
    below_ten = rdf.Filter("check_number_less_than_10(tdfentry_)")
    histogram = below_ten.Histo1D("tdfentry_")

    # Entries 0..9 survive: 10 entries with mean 4.5
    assert histogram.GetEntries() == 10
    assert histogram.GetMean() == pytest.approx(4.5)
def test_tchain_with_friend_tchain_histo(self, connection):
    """
    Tests that the computational graph can be issued both on the parent
    chain and the friend chain.

    The two input files are created by `self.create_parent_tree` and
    `self.create_friend_tree` and are removed at the end of the test, also
    when one of the checks fails.
    """
    main_filename = "main_chain.root"
    friend_filename = "friend_chain.root"

    try:
        self.create_parent_tree(main_filename)
        self.create_friend_tree(friend_filename)

        # Main TChain
        mainchain = ROOT.TChain("T")
        mainchain.Add(main_filename)

        # Friend TChain
        friendchain = ROOT.TChain("TF")
        friendchain.Add(friend_filename)

        # Add friend chain to the main one
        mainchain.AddFriend(friendchain)

        # Create a DistRDF RDataFrame with the main and the friend chains
        df = Spark.RDataFrame(mainchain, sparkcontext=connection)

        # One histogram on a column of the parent, one on the friend's
        h_parent = df.Histo1D("x")
        h_friend = df.Histo1D("TF.x")

        check_histograms(h_parent, h_friend)
    finally:
        # Remove the temporary .root files even if an assertion failed, so a
        # failing run does not leave stale inputs behind for later tests.
        for filename in (main_filename, friend_filename):
            if os.path.exists(filename):
                os.remove(filename)
def test_initialization_method(self, connection):
    """
    Check `DistRDF.initialize` with Spark backend.

    An integer is declared to the ROOT interpreter by the initialization
    function; the test then checks that the value can be read back from a
    distributed computation.
    """

    def declare_user_value(value):
        import ROOT
        cpp_code = f"int userValue = {value};"
        ROOT.gInterpreter.ProcessLine(cpp_code)

    DistRDF.initialize(declare_user_value, 123)

    # Histo1D is used because it is a supported action of the backend.
    #
    # 'userValue' exists only inside the ROOT interpreter, while its value
    # is chosen on the Python side. If the initialization function is
    # properly propagated to the Spark backend, each worker runs it as a
    # first step, so the variable is defined by the time the Define
    # expression is evaluated. The column 'u' of the single-entry dataframe
    # then holds that value, and the mean of a histogram filled with one
    # value is the value itself, independently of the number of workers.
    single_entry = Spark.RDataFrame(1, sparkcontext=connection)
    histo_proxy = single_entry.Define("u", "userValue").Histo1D("u")

    histogram = histo_proxy.GetValue()
    assert histogram.GetMean() == 123
def test_distributed_snapshot_columnlist(self, connection):
    """
    Test that distributed Snapshot correctly passes also the third input
    argument "columnList".

    Four columns are defined but only two are requested in the Snapshot;
    the files written must contain exactly the requested ones. The
    snapshotted files are removed at the end, also when the check fails.
    """
    expectedcolumns = ["a", "b"]
    # Names of the partial files this test expects the Snapshot to produce
    tmp_files = [
        "distrdf_spark_snapfile_columnlist_0.root",
        "distrdf_spark_snapfile_columnlist_1.root",
    ]

    # A simple dataframe with ten sequential numbers from 0 to 9
    df = Spark.RDataFrame(10, sparkcontext=connection)\
        .Define("a", "rdfentry_")\
        .Define("b", "rdfentry_")\
        .Define("c", "rdfentry_")\
        .Define("d", "rdfentry_")

    try:
        df.Snapshot("snapTree_columnlist",
                    "distrdf_spark_snapfile_columnlist.root",
                    expectedcolumns)

        # Create a traditional RDF from the snapshotted files to retrieve
        # the list of columns
        rdf = ROOT.RDataFrame("snapTree_columnlist", tmp_files)
        snapcolumns = [str(column) for column in rdf.GetColumnNames()]

        assert snapcolumns == expectedcolumns
    finally:
        # Clean up whatever was written, even if the Snapshot or the
        # assertion failed part-way through.
        for filename in tmp_files:
            if os.path.exists(filename):
                os.remove(filename)
def build_pyrdf_graph(self):
    """
    Build a DistRDF computation graph made of a fixed set of operations.

    Returns the four booked results as a tuple: the two pt histograms
    (booked on the unfiltered dataframe), the invariant mass histogram and
    the phi1/phi2 2D histogram (both booked after the cuts).
    """
    input_files = [
        "http://root.cern/files/teaching/CMS_Open_Dataset.root",
    ]
    rdf = Spark.RDataFrame("data", input_files, npartitions=5)

    # Analysis cuts, applied one after the other
    opposite_charge = rdf.Filter("C1 != C2", "Opposite Charge")
    central_muons = opposite_charge.Filter(
        "fabs(eta1) < 2.3 && fabs(eta2) < 2.3", "Central Muons")
    selected = central_muons.Filter("pt1 > 2 && pt2 > 2", "Sane Pt")

    # Invariant mass column
    inv_mass_expr = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                     "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
    with_mass = selected.Define("invMass", inv_mass_expr)

    # Histograms
    pt_model = ("", "", 128, 1, 1200)
    pt1_h = rdf.Histo1D(pt_model, "pt1")
    pt2_h = rdf.Histo1D(pt_model, "pt2")

    mass_model = ("invMass", "CMS Opendata;#mu#mu mass[GeV];Events", 512, 5, 110)
    invMass_h = with_mass.Histo1D(mass_model, "invMass")

    pi = ROOT.TMath.Pi()
    phi_model = ("", "", 64, -pi, pi, 64, -pi, pi)
    phis_h = with_mass.Histo2D(phi_model, "phi1", "phi2")

    return pt1_h, pt2_h, invMass_h, phis_h
def test_distributed_sum(self, connection):
    """Check that the `Sum` operation is supported by the distributed backend."""
    # Entries 0..9 summed over the column "x" give 45
    total = Spark.RDataFrame(10, sparkcontext=connection)\
        .Define("x", "rdfentry_")\
        .Sum("x")

    assert total.GetValue() == 45.0
def _includes_function_with_filter_and_histo(self, connection):
    """
    Verify that a Filter can call a C++ function made available through a
    distributed header file.
    """
    rdf = Spark.RDataFrame(10, sparkcontext=connection)
    rdf._headnode.backend.distribute_headers("../test_headers/header1.hxx")

    # Filter through the function provided by the header, then histogram
    # the surviving entries
    histo = rdf.Filter("check_number_less_than_5(tdfentry_)").Histo1D("tdfentry_")

    # Reference statistics computed in Python for the numbers 0..4
    expected_values = range(5)
    expected_count = len(expected_values)
    expected_mean = sum(expected_values) / float(expected_count)
    squared_deviations = sum((x - expected_mean)**2 for x in expected_values)
    expected_stddev = math.sqrt(squared_deviations / expected_count)

    # Size, mean and standard deviation must match the reference
    assert histo.GetEntries() == expected_count
    assert histo.GetMean() == expected_mean
    assert histo.GetStdDev() == expected_stddev
def test_distributed_sum(self):
    """Check that the `Sum` operation is supported by the distributed backend."""
    # Entries 0..9 summed over the column "x" give 45
    total = Spark.RDataFrame(10).Define("x", "rdfentry_").Sum("x")

    self.assertAlmostEqual(total.GetValue(), 45.0)
def test_definepersample_simple(self, connection):
    """
    Exercise DefinePerSample over three samples with an expression given
    directly as a string of C++ code.
    """
    dataframe = Spark.RDataFrame(self.maintreename, self.filenames,
                                 sparkcontext=connection)

    # Map each sample in self.samples to the number 1, 2 or 3 (0 otherwise)
    definepersample_code = """
    if(rdfsampleinfo_.Contains(\"{}\")) return 1;
    else if (rdfsampleinfo_.Contains(\"{}\")) return 2;
    else if (rdfsampleinfo_.Contains(\"{}\")) return 3;
    else return 0;
    """.format(*self.samples)

    with_sample_id = dataframe.DefinePerSample("sampleid", definepersample_code)

    # Book one Count per sample number before reading any value; each
    # selection is expected to hold the 10 entries of a single sample.
    booked_counts = []
    for sample_id in [1, 2, 3]:
        selection = "sampleid == {}".format(sample_id)
        booked_counts.append(with_sample_id.Filter(selection).Count())

    for booked in booked_counts:
        assert booked.GetValue() == 10
def test_distributed_asnumpy(self):
    """Check that the `AsNumpy` pythonization works in the distributed backend."""
    # Ten rows, one int column and one float column
    df = Spark.RDataFrame(10).Define("x", "(int)rdfentry_")\
        .Define("y", "1.f/(1.f+rdfentry_)")

    # AsNumpy returns a dictionary keyed by column name
    arrays = df.AsNumpy()
    self.assertIsInstance(arrays, dict)

    npy_x, npy_y = arrays["x"], arrays["y"]

    # Both values are numpy arrays...
    for array in (npy_x, npy_y):
        self.assertIsInstance(array, numpy.ndarray)

    # ...as long as the original columns...
    for array in (npy_x, npy_y):
        self.assertEqual(len(array), 10)

    # ...and with dtypes matching the column types.
    self.assertEqual(npy_x.dtype, numpy.dtype("int32"))
    self.assertEqual(npy_y.dtype, numpy.dtype("float32"))
def test_write_histo(self):
    """
    Tests that an histogram is correctly written to a .root file created
    before the execution of the event loop.

    The output file is reopened to compare the stored histogram against
    `self.nentries`, `self.gaus_mean` and `self.gaus_stdev`. Both ROOT files
    are closed and the temporary files removed at the end, also when a
    check fails.
    """
    outfile = None
    reopen_file = None

    try:
        self.create_tree_with_data()

        # Create a new file where the histogram will be written
        outfile = ROOT.TFile("out_file.root", "recreate")

        # Create a DistRDF RDataFrame from the tree in the input file
        df = Spark.RDataFrame("Events", "tree_gaus.root")

        # Create histogram
        histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

        # Write histogram to out_file.root and close the file
        histo.Write()
        outfile.Close()

        # Reopen file to check that histogram was correctly stored
        reopen_file = ROOT.TFile("out_file.root", "read")
        reopen_histo = reopen_file.Get("x")

        # Check histogram statistics
        self.assertEqual(reopen_histo.GetEntries(), self.nentries)
        self.assertAlmostEqual(reopen_histo.GetMean(), self.gaus_mean,
                               delta=self.delta_equal)
        self.assertAlmostEqual(reopen_histo.GetStdDev(), self.gaus_stdev,
                               delta=self.delta_equal)
    finally:
        # Close any file handle still open before deleting the files, so no
        # handle outlives the file it refers to.
        for root_file in (outfile, reopen_file):
            if root_file is not None and root_file.IsOpen():
                root_file.Close()

        # Remove unnecessary .root files, even after a failure
        for filename in ("tree_gaus.root", "out_file.root"):
            if os.path.exists(filename):
                os.remove(filename)
def test_distributed_snapshot(self):
    """Check that `Snapshot` is supported by the distributed backend."""
    # Ten sequential numbers, 0 to 9
    source_df = Spark.RDataFrame(10).Define("x", "rdfentry_")

    # Book the row count of the source dataframe
    source_count = source_df.Count()

    # Snapshot the dataframe; the result is a new dataframe on the
    # snapshotted files, whose rows are counted as well
    snapdf = source_df.Snapshot("snapTree", "snapFile.root")
    snapshot_count = snapdf.Count()

    self.assertEqual(source_count.GetValue(), 10)
    self.assertEqual(snapshot_count.GetValue(), 10)

    # The snapshotted dataframe must read exactly the expected partial files
    expected_files = ["snapFile_0_4.root", "snapFile_5_9.root"]
    actual_files = snapdf.proxied_node.get_inputfiles()
    self.assertListEqual(actual_files, expected_files)

    # The partial files must exist on disk; they are deleted right after
    # the check since nothing else needs them
    for filename in expected_files:
        self.assertTrue(os.path.exists(filename))
        os.remove(filename)
def test_distributed_asnumpy(self, connection):
    """Check that the `AsNumpy` pythonization works in the distributed backend."""
    # Ten rows, one int column and one float column
    base = Spark.RDataFrame(10, sparkcontext=connection)
    with_columns = base.Define("x", "(int)rdfentry_")\
        .Define("y", "1.f/(1.f+rdfentry_)")

    # The resulting dictionary of numpy arrays is validated by the helper
    self.check_npy_dict(with_columns.AsNumpy())
def test_distributed_snapshot(self, connection):
    """Check that `Snapshot` is supported by the distributed backend."""
    # Ten sequential numbers, 0 to 9
    base = Spark.RDataFrame(10, sparkcontext=connection)
    source_df = base.Define("x", "rdfentry_")

    # Snapshot the dataframe; the result is validated by the helper,
    # which receives the file name prefix used for the snapshot
    snapdf = source_df.Snapshot("snapTree", "snapFile.root")
    self.check_snapshot_df(snapdf, "snapFile")
def test_user_supplied_npartitions_have_precedence(self, connection):
    """
    The SparkContext of this class has 2 cores available, so
    `SparkBackend.optimize_npartitions` would return 2. A number of
    partitions given explicitly by the user must not be overwritten by
    the backend.
    """
    requested_partitions = 4
    df = Spark.RDataFrame(100, sparkcontext=connection,
                          npartitions=requested_partitions)

    # The head node must keep the user-supplied value
    assert df._headnode.npartitions == 4
def test_histo1d_merge(self):
    """Check that merging Histo1D results in the reducer works."""
    # Distributed version
    distributed_histo = Spark.RDataFrame(10).Histo1D("rdfentry_")

    # Reference computed with plain PyROOT
    reference_histo = ROOT.ROOT.RDataFrame(10).Histo1D("rdfentry_")

    # The two histograms must agree
    self.assertHistoOrProfile(distributed_histo, reference_histo)
def test_varyfiltersum(self, connection):
    """
    Check the nominal and varied values of a Sum booked after a Filter on
    a varied column.
    """
    base = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2)
    with_x = base.Define("x", "1")

    varied = with_x.Vary("x", "ROOT::RVecI{-1*x, 2*x}", ("down", "up"),
                         "myvariation")
    df_sum = varied.Filter("x > 0").Sum("x")

    # Nominal result first, then the full set of variations
    assert df_sum.GetValue() == 10
    sums = DistRDF.VariationsFor(df_sum)

    expected_sum_by_name = {
        "nominal": 10,
        "myvariation:down": 0,
        "myvariation:up": 20,
    }
    for varname, val in expected_sum_by_name.items():
        assert sums[varname] == val
def test_histo(self, connection):
    """
    Check the nominal and varied histograms of a Histo1D booked on a
    varied column.
    """
    base = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2)
    with_x = base.Define("x", "1")

    varied = with_x.Vary("x", "ROOT::RVecI{-2,2}", ["down", "up"])
    nominal_histo = varied.Histo1D("x")
    histos = DistRDF.VariationsFor(nominal_histo)

    expected_mean_by_name = {"nominal": 1, "x:up": 2, "x:down": -2}
    for varname, mean in expected_mean_by_name.items():
        histo = histos[varname]
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetEntries() == 10
        assert histo.GetMean() == mean
def test_mixed(self, connection):
    """
    Check the nominal and varied histograms of a Histo1D booked with two
    columns, only the first of which is varied.
    """
    base = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2)
    with_columns = base.Define("x", "1").Define("y", "42")

    varied = with_columns.Vary("x", "ROOT::RVecI{-1, 2}",
                               variationTags=["down", "up"])
    nominal_histo = varied.Histo1D("x", "y")
    histos = DistRDF.VariationsFor(nominal_histo)

    expectedmax = 420
    expected_mean_by_name = {"nominal": 1, "x:down": -1, "x:up": 2}
    for varname, mean in expected_mean_by_name.items():
        histo = histos[varname]
        assert isinstance(histo, ROOT.TH1D)
        assert histo.GetMaximum() == expectedmax
        assert histo.GetMean() == mean
def test_graph(self, connection):
    """
    Check the nominal and varied graphs of a Graph booked on a varied
    column, with variations identified by index rather than by tag.
    """
    base = Spark.RDataFrame(10, sparkcontext=connection, npartitions=2)
    with_x = base.Define("x", "1")

    nominal_graph = with_x.Vary("x", "ROOT::RVecI{-1, 2}", nVariations=2)\
        .Graph("x", "x")
    graphs = DistRDF.VariationsFor(nominal_graph)

    assert nominal_graph.GetMean() == 1

    expected_mean_by_name = {"nominal": 1, "x:0": -1, "x:1": 2}
    for varname, mean in expected_mean_by_name.items():
        graph = graphs[varname]
        assert isinstance(graph, ROOT.TGraph)
        assert graph.GetMean() == mean
def test_histo_from_empty_root_file(self):
    """
    Running distributed operations on an RDataFrame that has no entries
    must make DistRDF raise an error.
    """
    # Dataframe built on a file whose tree is empty
    empty_df = Spark.RDataFrame("NOMINAL", "emptytree.root")
    histo_model = ("empty", "empty", 10, 0, 10)
    histo = empty_df.Histo1D(histo_model, "mybranch")

    # Asking for the entries of the histogram is what raises the error
    with self.assertRaises(RuntimeError):
        histo.GetEntries()
def test_user_supplied_npartitions_have_precedence(self):
    """
    A value of npartitions supplied by the user takes precedence, even
    when the Spark configuration options could be used to optimize the
    number of partitions.
    """
    spark_options = {"spark.executor.cores": 4, "spark.executor.instances": 4}
    sconf = pyspark.SparkConf().setAll(spark_options.items())
    # NOTE(review): this context is not stopped inside this method - confirm
    # that a tearDown/fixture of the enclosing class takes care of it.
    scontext = pyspark.SparkContext(conf=sconf)

    df = Spark.RDataFrame(100, sparkcontext=scontext, npartitions=4)

    # The head node must keep the user-supplied value
    self.assertEqual(df._headnode.npartitions, 4)
def test_count_with_same_tree_repeated(self, connection):
    """
    Count entries of a dataset with three times the same tree.

    A file with a 100-entry tree is written locally and passed three times
    as input, so the distributed Count must return 300. The file is removed
    at the end, also when the check fails.
    """
    treename = "tree"
    filename = "distrdf_roottest_spark_check_backend_same_tree.root"
    filenames = [filename] * 3

    try:
        df = ROOT.RDataFrame(100).Define("x", "1")
        df.Snapshot(treename, filename, ["x"])

        rdf = Spark.RDataFrame(treename, filenames, sparkcontext=connection)
        assert rdf.Count().GetValue() == 300
    finally:
        # Do not leave the input file behind when the assertion fails
        if os.path.exists(filename):
            os.remove(filename)
def test_profile1d_merge(self):
    """Check that merging Profile1D results in the reducer works."""
    profile_model = ("", "", 64, -4, 4)

    # Distributed version
    distributed_df = self.define_two_columns(Spark.RDataFrame(10))
    distributed_profile = distributed_df.Profile1D(profile_model, "x", "y")

    # Reference computed with plain PyROOT
    reference_df = self.define_two_columns(ROOT.ROOT.RDataFrame(10))
    reference_profile = reference_df.Profile1D(profile_model, "x", "y")

    # The two profiles must agree
    self.assertHistoOrProfile(distributed_profile, reference_profile)
def test_redefine_one_column(self, connection):
    """Check that the values of a single column can be redefined."""
    # Ten entries; "x" is 1 before the redefinition and 2 after it
    base = Spark.RDataFrame(10, sparkcontext=connection)
    original = base.Define("x", "1")
    redefined = original.Redefine("x", "2")

    # Book both sums before reading any value
    sum_before = original.Sum("x")
    sum_after = redefined.Sum("x")

    # 10 entries of value 1, then 10 entries of value 2
    assert sum_before.GetValue() == 10.0
    assert sum_after.GetValue() == 20.0
def test_friends_tchain_noname_add_fullpath_addfriend_alias(self):
    """
    Test against the reproducer of issue
    https://github.com/root-project/root/issues/7584

    Two trees of random numbers are written to separate files and merged
    into one file with `hadd`. A parent chain and a friend chain, both
    created without a name, each receive one tree in the form
    "filename/treename"; the friend is added to the parent with an alias.
    All temporary files are removed at the end, also when a check fails.
    """
    rn1 = "rn1.root"
    rn2 = "rn2.root"
    friendsfilename = "friendtrees_spark.root"

    try:
        df_1 = ROOT.RDataFrame(10000)
        df_2 = ROOT.RDataFrame(10000)

        df_1 = df_1.Define("rnd", "gRandom->Gaus(10)")
        df_2 = df_2.Define("rnd", "gRandom->Gaus(20)")

        df_1.Snapshot("randomNumbers", rn1)
        df_2.Snapshot("randomNumbersBis", rn2)

        # Put the two trees together in a common file. An argument list is
        # used instead of a shell command line: no shell is needed here and
        # the file names are passed through verbatim.
        subprocess.run(["hadd", "-f", friendsfilename, rn1, rn2], check=True)

        # Parent chain and friend chain with no names, one tree each given
        # as "filename/treename"; the friend is attached with an alias.
        chain = ROOT.TChain()
        chainFriend = ROOT.TChain()

        chain.Add("{}/randomNumbers".format(friendsfilename))
        chainFriend.Add("{}/randomNumbersBis".format(friendsfilename))

        chain.AddFriend(chainFriend, "myfriend")

        df = Spark.RDataFrame(chain)

        h_parent = df.Histo1D("rnd")
        h_friend = df.Histo1D("myfriend.rnd")

        self.assertEqual(h_parent.GetEntries(), 10000)
        self.assertEqual(h_friend.GetEntries(), 10000)

        self.assertAlmostEqual(h_parent.GetMean(), 10, delta=0.01)
        self.assertAlmostEqual(h_friend.GetMean(), 20, delta=0.01)

        self.assertAlmostEqual(h_parent.GetStdDev(), 1, delta=0.01)
        self.assertAlmostEqual(h_friend.GetStdDev(), 1, delta=0.01)
    finally:
        # Remove every file that was actually created, whatever step failed
        for filename in (rn1, rn2, friendsfilename):
            if os.path.exists(filename):
                os.remove(filename)
def test_rungraphs_sparkanddask_3histos(self, connection):
    """
    Submit three different RDF graphs concurrently to Spark and Dask.

    `connection` is unpacked into a Dask client and a Spark context. One
    histogram per column and per backend is booked, all of them are run
    with a single `DistRDF.RunGraphs` call and then checked. The input file
    is removed at the end, also when a check fails.
    """
    daskconn, sparkconn = connection

    treename = "myTree"
    filename = "2clusters.root"
    nentries = 10000
    columns = ["b1", "b2", "b3"]

    try:
        # Create a test file for processing; every column holds the value 42
        opts = ROOT.RDF.RSnapshotOptions()
        opts.fAutoFlush = 5000
        ROOT.RDataFrame(nentries).Define("b1", "42")\
            .Define("b2", "42")\
            .Define("b3", "42")\
            .Snapshot(treename, filename, columns, opts)

        histoproxies_spark = [
            Spark.RDataFrame(treename, filename, sparkcontext=sparkconn,
                             npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in columns
        ]

        histoproxies_dask = [
            Dask.RDataFrame(treename, filename, daskclient=daskconn,
                            npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in columns
        ]

        histoproxies = histoproxies_spark + histoproxies_dask

        # Before triggering the computation graphs values are None
        for proxy in histoproxies:
            assert proxy.proxied_node.value is None

        DistRDF.RunGraphs(histoproxies)

        # After RunGraphs all histograms are correctly assigned to the
        # node objects
        for proxy in histoproxies:
            histo = proxy.proxied_node.value
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetEntries() == nentries
            assert histo.GetMean() == 42
    finally:
        # Do not leave the input file behind when a check fails
        if os.path.exists(filename):
            os.remove(filename)
def test_histo3d_merge(self):
    """Check that merging Histo3D results in the reducer works."""
    histo_model = ("", "", 64, -4, 4, 64, -4, 4, 64, -4, 4)

    # Distributed version
    distributed_df = self.define_three_columns(Spark.RDataFrame(10))
    distributed_histo = distributed_df.Histo3D(histo_model, "x", "y", "z")

    # Reference computed with plain PyROOT
    reference_df = self.define_three_columns(ROOT.ROOT.RDataFrame(10))
    reference_histo = reference_df.Histo3D(histo_model, "x", "y", "z")

    # The two histograms must agree
    self.assertHistoOrProfile(distributed_histo, reference_histo)
def test_distributed_asnumpy_lazy(self, connection):
    """Check that `AsNumpy` can still be requested lazily in distributed mode."""
    # Ten rows, one int column and one float column
    base = Spark.RDataFrame(10, sparkcontext=connection)
    with_columns = base.Define("x", "(int)rdfentry_")\
        .Define("y", "1.f/(1.f+rdfentry_)")

    lazy_result = with_columns.AsNumpy(lazy=True)

    # A lazy request returns a proxy whose node has no value yet, i.e. the
    # event loop has not run
    assert isinstance(lazy_result, ActionProxy)
    assert lazy_result.proxied_node.value is None

    # Asking for the value triggers the computation; the resulting
    # dictionary is validated by the helper
    arrays = lazy_result.GetValue()
    self.check_npy_dict(arrays)