def test_list_trees():
    """Check ``list_trees`` on a file holding a TTree and on one holding a TNtuple."""
    # (fixture file, expected tree names): the TTree case first, then the TNtuple case
    cases = (
        ('vary1.root', ['tree']),
        ('ntuple.root', ['ntuple']),
    )
    for fixture, expected in cases:
        assert_equal(rnp.list_trees(load(fixture)), expected)
def get_any_tree(tfilepath):
    """Return the name of the only tree stored in ``tfilepath``.

    Raises:
        ValueError: if the file does not contain exactly one tree; the
            message lists the tree names that were found.
    """
    trees = list_trees(tfilepath)  # list of tree names
    if len(trees) != 1:
        raise ValueError('More/less than one tree found in {}\nPossible trees: {}'.format(tfilepath, trees))
    return trees[0]
def _run_tmva_training(self, info):
    """
    Run subprocess to train tmva factory

    The estimator (``self``) and ``info`` are pickled, in that order, to the
    standard input of a child interpreter that runs
    ``rep.estimators._tmvaFactory.main()`` with ``info.directory`` as its
    working directory. After a successful run the weights XML written by the
    child is read back into ``self.formula_xml``.

    :param info: class with additional information; the attributes
        ``directory``, ``tmva_root`` and ``tmva_job`` are read here
    :raises AssertionError: if the child exits with a non-zero code or the
        result file has no ``TrainTree``
    :raises OSError: if ``info.directory`` cannot be used as working directory
    """
    # Argument list + ``cwd`` instead of a ``cd ...; python ...`` shell string:
    # paths containing spaces or shell metacharacters are passed through
    # verbatim, and a missing directory fails loudly instead of silently
    # training in the current directory.
    tmva_process = subprocess.Popen(
        [sys.executable, '-c', 'from rep.estimators import _tmvaFactory; _tmvaFactory.main()'],
        cwd=info.directory,
        stdin=PIPE, stdout=PIPE,
        stderr=subprocess.STDOUT)

    cPickle.dump(self, tmva_process.stdin)
    cPickle.dump(info, tmva_process.stdin)
    # stderr is merged into stdout above, so ``stderr`` is always None here
    stdout, stderr = tmva_process.communicate()
    assert tmva_process.returncode == 0, \
        'ERROR: TMVA process is incorrect finished \n LOG: %s \n %s' % (stderr, stdout)

    assert 'TrainTree' in root_numpy.list_trees(os.path.join(info.directory, info.tmva_root)), \
        'ERROR: Result file has not TrainTree'

    xml_filename = os.path.join(info.directory, 'weights',
                                '{job}_{name}.weights.xml'.format(job=info.tmva_job,
                                                                  name=self._method_name))
    with open(xml_filename, 'r') as xml_file:
        self.formula_xml = xml_file.read()
def _run_tmva_training(self, info):
    """
    Run subprocess to train tmva factory

    ``self`` and then ``info`` are pickled to the standard input of a child
    Python process that executes ``rep.estimators._tmvaFactory.main()`` inside
    ``info.directory``. When the child succeeds, the generated weights XML is
    loaded into ``self.formula_xml``.

    :param info: class with additional information (``directory``,
        ``tmva_root`` and ``tmva_job`` are used here)
    :raises AssertionError: if the child returns a non-zero exit code or the
        result file has no ``TrainTree``
    :raises OSError: if the child cannot be started in ``info.directory``
    """
    # Use an argument list and ``cwd`` rather than building a shell command:
    # the directory and interpreter paths no longer need quoting, and a bad
    # directory raises immediately instead of being skipped by ``cd ...;``.
    command = [
        sys.executable,
        '-c',
        'from rep.estimators import _tmvaFactory; _tmvaFactory.main()',
    ]
    tmva_process = subprocess.Popen(
        command,
        cwd=info.directory,
        stdin=PIPE,
        stdout=PIPE,
        stderr=subprocess.STDOUT)

    cPickle.dump(self, tmva_process.stdin)
    cPickle.dump(info, tmva_process.stdin)
    # stderr was redirected into stdout, so the second item is always None
    stdout, stderr = tmva_process.communicate()
    assert tmva_process.returncode == 0, \
        'ERROR: TMVA process is incorrect finished \n LOG: %s \n %s' % (stderr, stdout)

    assert 'TrainTree' in root_numpy.list_trees(os.path.join(info.directory, info.tmva_root)), \
        'ERROR: Result file has not TrainTree'

    xml_filename = os.path.join(
        info.directory, 'weights',
        '{job}_{name}.weights.xml'.format(job=info.tmva_job, name=self._method_name))
    with open(xml_filename, 'r') as xml_file:
        self.formula_xml = xml_file.read()
def get_any_tree(tfilepath):
    """Return the name of the single tree contained in ``tfilepath``.

    Raises:
        ValueError: if the file contains no tree, or more than one tree.
    """
    trees = rn.list_trees(tfilepath)  # returns list of tree-names
    if not trees:
        # Previously reported as "More than one tree found", which was wrong
        raise ValueError('No trees found in {}'.format(tfilepath))
    if len(trees) > 1:
        raise ValueError('More than one tree found in {}'.format(tfilepath))
    return trees[0]
def from_file(files, **options):
    """
    Load a dataset from a file or collection of files.

    files: string file name, glob pattern or iterable of string file names.
    options:
        treename: str, name of the TTree object in the collection of files.
                  Optional only if all input files contain a single TTree,
                  with the same name.

    Returns a ROOTDataset wrapping a TChain built from all the files.

    Raises ValueError when the tree name cannot be determined unambiguously
    (no files, a file without trees, a file with several trees, or files
    disagreeing on the tree name), or when the requested tree name is missing
    from one of the files.
    """
    if isinstance(files, str):
        files = glob.glob(files)
    else:
        # The names are walked more than once below; materialise them so that
        # one-shot iterables (generators) are not exhausted by the first pass.
        files = list(files)
    treename = options.get("treename")

    from root_numpy import list_trees  # TODO: avoid the dependency

    if treename is None:
        without_tree = []
        ambiguous = []
        treenames = set()
        for fname in files:
            trees = list_trees(fname)
            if not trees:
                without_tree.append(fname)
            elif len(trees) > 1:
                ambiguous.append(fname)
            else:
                treenames.add(trees[0])
        if without_tree:
            raise ValueError(
                "No trees found in file(s) {0}!".format(','.join(without_tree)))
        if ambiguous:
            raise ValueError(
                "Multiple trees found in file(s) {0}!\nPlease specify a tree name."
                .format(','.join(ambiguous)))
        if len(treenames) > 1:
            raise ValueError(
                "Different tree names found!\nPlease specify the desired tree name."
            )
        if not treenames:
            # only reachable when there were no input files at all
            raise ValueError("No input files found!\nPlease check your inputs.")
        treename = treenames.pop()
    else:
        # The requested tree may be any of the trees in a file, not only the
        # first one listed.
        missing = [fname for fname in files if treename not in list_trees(fname)]
        if missing:
            raise ValueError(
                "Specified tree name '{0}' not found in file(s) {1}!\nPlease check your inputs."
                .format(treename, ','.join(missing)))

    chain = ROOT.TChain(treename)
    for name in files:
        chain.Add(name)
    return ROOTDataset(
        chain, FileOrigin([f.GetTitle() for f in chain.GetListOfFiles()]))
def table_from_root(source, treename=None, include_names=None, **kwargs):
    """Read a Table from a ROOT tree

    ``source`` is a file name, or an object whose ``name`` attribute is used
    as the file name. When ``treename`` is not given the file must contain
    exactly one tree. A ``selection`` keyword is split into filters that are
    handed to ``root_numpy`` and filters applied to the table afterwards.
    Remaining keywords are forwarded to ``root_numpy.root2array``.
    """
    import root_numpy

    # ``columns`` is the deprecated spelling of ``include_names``; it is only
    # consumed when the new keyword was not supplied
    if include_names is None and 'columns' in kwargs:
        include_names = kwargs.pop('columns')
        warnings.warn("Keyword argument `columns` has been renamed to "
                      "`include_names` to better match default "
                      "astropy.table.Table.read kwargs, please update "
                      "your call.", DeprecationWarning)

    # parse column filters into tree2array ``selection`` keyword
    # NOTE: not all filters can be passed directly to root_numpy, so we store
    #       those separately and apply them after-the-fact before returning
    filters = None  # no filters
    if 'selection' in kwargs:
        selection = kwargs.pop('selection')
        rootfilters = []
        filters = []
        for col, op_, value in parse_column_filters(selection):
            try:
                names = [key for key in OPERATORS if OPERATORS[key] is op_]
            except KeyError:
                names = []
            if names:  # can filter with root_numpy
                rootfilters.append('{0} {1} {2!r}'.format(col, names[0], value))
            else:  # cannot filter with root_numpy
                filters.append((col, op_, value))
        kwargs['selection'] = ' && '.join(rootfilters)

    # pass file name (not path)
    if not isinstance(source, string_types):
        source = source.name

    # find single tree (if only one tree present)
    if treename is None:
        trees = root_numpy.list_trees(source)
        if not trees:
            raise ValueError("No trees found in %s" % source)
        if len(trees) != 1:
            raise ValueError("Multiple trees found in %s, please select on "
                             "via the `treename` keyword argument, e.g. "
                             "`treename='events'`. Available trees are: %s."
                             % (source, ', '.join(map(repr, trees))))
        treename = trees[0]

    # read, filter, and return
    array = root_numpy.root2array(source, treename, branches=include_names,
                                  **kwargs)
    table = Table(array)
    if filters:
        return filter_table(table, *filters)
    return table
def find_tree(tree_to_look_for, filename):
    """
    Return the first tree in ``filename`` whose name contains
    ``tree_to_look_for``, compared case-insensitively.

    Raises ValueError when no tree name matches.
    """
    for candidate in rn.list_trees(filename):
        if tree_to_look_for.lower() in candidate.lower():
            return candidate
    raise ValueError("No tree with name \"" + tree_to_look_for + "\" found in file")
def Read_Root_file(Path_to_tree, Tree_name_array, Branches, Tree_selection=None):
    """Transform .root files with branches into an ndarray (similar to a dictionary).

    Returns a structured array where each field is a branch.
    This function uses root2array, which is a root_numpy function.
    The tree that is read is the first tree listed in the first file.

    Parameters:
    Path_to_tree -- The complete path to the folder of the .root files
    Tree_name_array -- The names of the .root files inside the folder, without
                       the .root extension. For example: ['bla1', 'bla2']
    Branches -- The branches to read, for example ['Brho', 'D'], or the
                string 'All' to read every branch
    Tree_selection -- An optional preliminary selection, passed unchanged to
                      root2array. For example: 'ThetaLdeg > 0.0 && D == 0.0'

    Exceptions:
    ValueError -- if Tree_name_array is empty or the first file has no tree

    Return:
    array_data -- The array returned by root2array for the selected branches
    """
    path_filename = [os.path.join(Path_to_tree, name) + '.root'
                     for name in Tree_name_array]  # The file paths
    if not path_filename:
        raise ValueError('No file names given in Tree_name_array')
    path_treename = rn.list_trees(path_filename[0])  # The tree names inside the first file
    if not path_treename:
        raise ValueError('No trees found in {}'.format(path_filename[0]))
    treename = path_treename[0]

    print('We are reading this root files:')
    print(Tree_name_array)
    # isinstance guard: comparing a list/array of branch names with == 'All'
    # is not a reliable scalar test
    if isinstance(Branches, str) and Branches == 'All':
        # Assumption: all the files have the same branch names as the first one.
        # The tree name is passed so this agrees with the tree read below.
        Tree_branches = rn.list_branches(path_filename[0], treename)
        print('All branches are chosen')
    else:
        Tree_branches = Branches
        print('These branches ')
        print(Branches)
        print('are selected')

    if Tree_selection is not None:
        print('The pre-selection over trees is ', Tree_selection)
    else:
        print('No pre-selection applied')
    # selection=None is root2array's default, so one call covers both cases
    array_data = rn.root2array(filenames=path_filename,
                               treename=treename,
                               branches=Tree_branches,
                               selection=Tree_selection)
    return array_data
def test_list_trees():
    """Check ``list_trees`` for a TTree, a TNtuple, repeated key cycles and a subdirectory."""
    # TTree
    assert_equal(rnp.list_trees(load('vary1.root')), ['tree'])
    # TNtuple
    assert_equal(rnp.list_trees(load('ntuple.root')), ['ntuple'])
    # Multiple key cycles of the same tree
    with temp() as rfile:

        def trees_on_disk():
            return rnp.list_trees(rfile.GetName())

        tree = ROOT.TTree('tree', 'tree')
        rfile.Write()
        assert_equal(len(trees_on_disk()), 1)
        # writing a second time must not make the tree show up twice
        rfile.Write()
        assert_equal(len(trees_on_disk()), 1)
        # a tree with the same name inside a directory is listed with its path
        subdir = rfile.mkdir('dir')
        subdir.cd()
        tree = ROOT.TTree('tree', 'tree')
        rfile.Write()
        assert_equal(set(trees_on_disk()), {'tree', 'dir/tree'})
def check_truncate_impute(filename):
    """Exercise ``(branch, fill_value[, length])`` branch specs.

    The same checks are run through ``root2array`` (given the file name) and
    ``tree2array`` (given the first tree of the file).
    """
    filename = load(filename)
    # first convert array and find object columns
    arr = rnp.root2array(filename)
    assert_true(len(arr))
    object_fields = [name for name in arr.dtype.names
                     if arr.dtype[name] == 'O']
    fields_1d = []
    for name in object_fields:
        first_entry = arr[name][0]
        if first_entry.dtype != 'O' and len(first_entry.shape) == 1:
            fields_1d.append(name)
    fields_md = list(set(object_fields) - set(fields_1d))
    assert_true(fields_1d)
    assert_true(fields_md)
    fields_1d.sort()
    fields_md.sort()

    rfile = ROOT.TFile.Open(filename)
    tree = rfile.Get(rnp.list_trees(filename)[0])

    def spec(*params):
        # one (name, *params) tuple per 1d field
        return [(name,) + params for name in fields_1d]

    first_1d = fields_1d[0]
    expression = '{0}==0'.format(first_1d)

    # test both root2array and tree2array
    for func, arg in ((rnp.root2array, filename), (rnp.tree2array, tree)):
        arr1 = func(arg, branches=spec(0))
        assert_true(len(arr1))
        assert_equal(set(arr1.dtype.names), set(fields_1d))
        # Giving length of 1 will result in the same output
        arr2 = func(arg, branches=spec(0, 1))
        assert_array_equal(arr1, arr2)
        # fill_value of 1 instead of 0 should change output array
        arr2 = func(arg, branches=spec(1, 1))
        assert_raises(AssertionError, assert_array_equal, arr1, arr2)
        # check dtype shape
        arr3 = func(arg, branches=spec(0, 3))
        for name in fields_1d:
            assert_equal(arr3.dtype[name].shape, (3,))
        # length must be at least 1
        assert_raises(ValueError, func, arg, branches=[(first_1d, 0, 0)])
        # tuple is not of length 2 or 3
        assert_raises(ValueError, func, arg, branches=[(first_1d, 1, 1, 1)])
        assert_raises(ValueError, func, arg, branches=(first_1d, 1, 1, 1))
        # can only truncate 1d arrays
        assert_raises(TypeError, func, arg, branches=(fields_md[0], 0))
        # expressions
        arr1 = func(arg, branches=expression)
        assert_equal(arr1.dtype, 'O')
        arr2 = func(arg, branches=(expression, 0))
        assert_equal(arr2.dtype, arr1[0].dtype)
def export_root_data_contents(self, rfile):
    """
    Uses root_numpy to list the trees of a ROOT file and print them.

    For every tree the tree name, its branch names and its contents (viewed
    as a ``numpy.recarray``) are printed to standard output.

    :param rfile: Input ROOT file
    :return: None; the data is only printed, nothing is collected
    """
    for tree in list_trees(rfile):
        # Single-argument ``print(...)`` produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('Processing tree ' + tree)
        print(list_branches(rfile, tree))
        arr = root2array(rfile, treename=tree)
        print(str(arr.view(numpy.recarray)))
def Branches_to_Arrays(Path_to_tree, Tree_names, Tree_name_inside, Branches, Tree_selection = None):
    """Transform .root files with branches into an array of arrays (similar to a dictionary).

    Returns a structured array where each field is a branch.
    This function uses root2array, which is a root_numpy function.

    Parameters:
    Path_to_tree -- The complete path to the folder of the .root files
    Tree_names -- The names of the .root files inside the folder, without the
                  .root extension. For example: ['bla1', 'bla2']
    Tree_name_inside -- The name of the tree to read inside the files, ex:
                        'GM', 'AD'. If the first file has no tree with this
                        name a warning is printed and the first tree listed
                        in that file is read instead.
    Branches -- The branches to read, for example ['Brho', 'D'], or 'All' if
                one wants all the branches
    Tree_selection -- An optional preliminary selection, passed unchanged to
                      root2array. For example: 'ThetaLdeg > 0.0 && D == 0.0'

    Exceptions:
    ValueError -- if Tree_names is empty or the first file has no tree

    Return:
    array_data -- The array returned by root2array for the selected branches
    """
    print('You are working with this samples: ')
    print(str(Tree_names))
    print('\n')

    path_filename = [os.path.join(Path_to_tree, name) + '.root'
                     for name in Tree_names]  # The file paths
    if not path_filename:
        raise ValueError('No file names given in Tree_names')
    path_treename = rn.list_trees(path_filename[0])  # The tree names inside the first root file (a list)
    print('The trees inside are: ', path_treename)
    print('\n')
    if not path_treename:
        raise ValueError('No trees found in {}'.format(path_filename[0]))

    if Tree_name_inside in path_treename:
        # Read the tree that was asked for, even when it is not listed first
        treename = Tree_name_inside
    else:
        print('Wrong tree election!')
        print('\n')
        treename = path_treename[0]  # fall back to the first tree

    if isinstance(Branches, str) and Branches == 'All':
        Tree_branches = None  # in root_numpy None means: take all branches
        print('All branches are chosen')
        print('\n')
    else:
        Tree_branches = Branches
        print('This branches ')
        print(Branches)
        print('are selected')
        print('\n')

    if Tree_selection is not None:
        print('The pre-selection over trees is ', Tree_selection)
    else:
        print('No pre-selection applied')
    # selection=None is root2array's default, so one call covers both cases
    array_data = rn.root2array(filenames = path_filename,
                               treename = treename,
                               branches = Tree_branches,
                               selection = Tree_selection)
    return array_data
def load_root_tree(files, tree=None, columns=None, ignore=None, *kargs, **kwargs):
    """Read a tree from one or more ROOT files into a pandas DataFrame.

    The first file is used to determine the tree name (when ``tree`` is not
    given) and the list of available branches.

    Parameters:
        files: a single file name or a list of file names.
        tree: name of the tree to read; when None the first file must hold
            exactly one tree.
        columns: patterns selecting the branches to read; all branches are
            read when empty or None.
        ignore: pattern or list of patterns for branches to leave out.
        *kargs, **kwargs: forwarded to ``root_numpy.root2array``.

    Returns:
        DataFrame built from the records; a field called ``index``, when
        present, becomes the DataFrame index.

    Raises:
        ValueError: if ``tree`` is None and the first file contains no tree
            or more than one tree.
    """
    from pandas import DataFrame
    from root_numpy import root2array, list_trees, list_branches

    # check if we get a list or a single file
    if not isinstance(files, list):
        files = [files]
    # use the first file to define tree & branches
    init_file = files[0]

    # check to see if there is a specified tree, if not,
    # look for a single tree. If the choice is ambiguous,
    # raise an error and exit
    if tree is None:
        trees = list_trees(init_file)
        if len(trees) == 1:
            tree = trees[0]
        elif len(trees) == 0:
            raise ValueError('Error: no trees found in {}'.format(init_file))
        else:
            raise ValueError(
                'Ambiguous call: more than one tree found in {}'.format(
                    init_file))

    branches = list_branches(init_file, tree)

    # match existing branches to branches asked for by user
    if not columns:
        all_vars = branches
    else:
        all_vars = get_matching_variables(branches, columns)

    # handle branches that are asked to be ignored
    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        # Same argument order as the ``columns`` call above: the available
        # branches first, then the patterns (the arguments used to be swapped).
        # NOTE(review): get_matching_variables is defined outside this block -
        # confirm its (branches, patterns) signature.
        ignored = set(get_matching_variables(branches, ignore))
        # Filtering, unlike list.remove, does not fail when an ignored branch
        # was never selected by ``columns``.
        all_vars = [var for var in all_vars if var not in ignored]

    arr = root2array(files, tree, all_vars, *kargs, **kwargs)
    if 'index' in arr.dtype.names:
        df = DataFrame.from_records(arr, index='index')
    else:
        df = DataFrame.from_records(arr)
    return df
def table_from_root(source, treename=None, columns=None, **kwargs):
    """Read a Table from a ROOT tree

    ``source`` is a file name, or an object whose ``name`` attribute gives
    the file name. Without ``treename`` the file must hold exactly one tree.
    ``columns`` is passed to ``root_numpy.root2array`` as ``branches``. A
    ``selection`` keyword is split into filters root_numpy can apply and
    filters that are applied to the table afterwards.
    """
    import root_numpy

    # parse column filters into tree2array ``selection`` keyword
    # NOTE: not all filters can be passed directly to root_numpy, so we store
    #       those separately and apply them after-the-fact before returning
    filters = None  # no filters
    if 'selection' in kwargs:
        selection = kwargs.pop('selection')
        rootfilters = []
        filters = []
        for col, op_, value in parse_column_filters(selection):
            try:
                candidates = [key for key in OPERATORS
                              if OPERATORS[key] is op_]
            except KeyError:
                candidates = []
            if not candidates:  # cannot filter with root_numpy
                filters.append((col, op_, value))
                continue
            # can filter with root_numpy
            rootfilters.append(
                '{0} {1} {2!r}'.format(col, candidates[0], value))
        kwargs['selection'] = ' && '.join(rootfilters)

    # pass file name (not path)
    if not isinstance(source, string_types):
        source = source.name

    # find single tree (if only one tree present)
    if treename is None:
        trees = root_numpy.list_trees(source)
        n_trees = len(trees)
        if n_trees == 0:
            raise ValueError("No trees found in %s" % source)
        if n_trees > 1:
            raise ValueError("Multiple trees found in %s, please select on "
                             "via the `treename` keyword argument, e.g. "
                             "`treename='events'`. Available trees are: %s."
                             % (source, ', '.join(map(repr, trees))))
        treename = trees[0]

    # read, filter, and return
    table = Table(root_numpy.root2array(
        source,
        treename,
        branches=columns,
        **kwargs
    ))
    return filter_table(table, *filters) if filters else table
def read_trees(input_dir, selection):
    """Read every known-process tree from the ROOT files matching ``input_dir + "*.root"``.

    Tree names are split on ``"__"`` into process, optional systematic and
    optional direction (``plus``/``minus``). Trees whose process is not a key
    of ``process_groups``, or whose direction is neither ``plus`` nor
    ``minus``, are skipped, as are trees that yield an empty DataFrame.
    Returns one concatenated DataFrame with categorical ``Category`` and
    ``Group`` columns and a fresh index.
    """
    frames = []
    for root_file in glob.iglob(input_dir + r"*.root"):
        for tree in list_trees(root_file):
            # Parse tree name
            parts = tree.split("__")
            process = parts[0].split("Ttree_", 1)[-1]
            if process not in process_groups:
                continue

            if len(parts) < 2:
                systematic = ""
            else:
                systematic = "__" + parts[1]
                if len(parts) > 2:
                    direction = parts[2]
                    if direction == "plus":
                        systematic = systematic + "Up"
                    elif direction == "minus":
                        systematic = systematic + "Down"
                    else:
                        continue

            with warnings.catch_warnings():
                warnings.simplefilter('ignore', RuntimeWarning)
                frame = read_tree(root_file, tree, where=selection)

            if frame.empty:
                print(process + systematic, "skipped (empty)")
                continue
            print(process + systematic)

            # Label $PROCESS__$SYSTEMATIC
            frame = frame.assign(Group=process_groups[process] + systematic)
            frame = frame.assign(Category=process + systematic)
            frames.append(frame)

    combined = pd.concat(frames)
    combined.Category = combined.Category.astype('category')
    combined.Group = combined.Group.astype('category')
    return combined.reset_index(drop=True)
def export_root_to_csv(filename, branches=None):
    """Export every tree of the selected ROOT file to its own CSV file.

    Each tree is written next to the input file as
    ``<filename without extension>_<tree name>.csv``. Requires the
    root_numpy and pandas modules.

    :param filename: path of the ROOT file
    :param branches: branches to export, passed to ``root_numpy.root2array``;
        the default None is forwarded unchanged
    :return: list with the names of the CSV files that were written
    """
    import root_numpy
    import os
    trees = root_numpy.list_trees(filename)
    # these are tree names, not branch names
    print("The following trees are found:\n %s" % trees)
    result = []
    for tree_name in trees:
        x = root_numpy.root2array(filename, treename=tree_name, branches=branches)
        new_file_name = os.path.splitext(filename)[0] + '_' + tree_name + '.csv'
        pandas.DataFrame(x).to_csv(new_file_name)
        result.append(new_file_name)
    print("Successfully converted")
    return result
def check_truncate_impute(filename):
    """Check truncation/imputation branch specs via ``root2array`` and ``tree2array``.

    Branch specs are ``(name, fill_value)`` or ``(name, fill_value, length)``
    tuples; valid and invalid forms are tried against the 1d object fields
    found in the file.
    """
    filename = load(filename)
    # first convert array and find object columns
    arr = rnp.root2array(filename)
    assert_true(len(arr))

    def is_object(name):
        return arr.dtype[name] == 'O'

    def is_flat(name):
        return arr[name][0].dtype != 'O' and len(arr[name][0].shape) == 1

    object_fields = [name for name in arr.dtype.names if is_object(name)]
    fields_1d = [name for name in object_fields if is_flat(name)]
    fields_md = list(set(object_fields) - set(fields_1d))
    assert_true(fields_1d)
    assert_true(fields_md)
    fields_1d.sort()
    fields_md.sort()

    rfile = ROOT.TFile.Open(filename)
    tree = rfile.Get(rnp.list_trees(filename)[0])

    readers = [
        (rnp.root2array, filename),
        (rnp.tree2array, tree),
    ]
    head = fields_1d[0]
    # test both root2array and tree2array
    for func, arg in readers:
        filled = func(arg, branches=[(name, 0) for name in fields_1d])
        assert_true(len(filled))
        assert_equal(set(filled.dtype.names), set(fields_1d))
        # Giving length of 1 will result in the same output
        same = func(arg, branches=[(name, 0, 1) for name in fields_1d])
        assert_array_equal(filled, same)
        # fill_value of 1 instead of 0 should change output array
        different = func(arg, branches=[(name, 1, 1) for name in fields_1d])
        assert_raises(AssertionError, assert_array_equal, filled, different)
        # check dtype shape
        widened = func(arg, branches=[(name, 0, 3) for name in fields_1d])
        for name in fields_1d:
            assert_equal(widened.dtype[name].shape, (3,))
        # length must be at least 1
        assert_raises(ValueError, func, arg, branches=[(head, 0, 0)])
        # tuple is not of length 2 or 3
        assert_raises(ValueError, func, arg, branches=[(head, 1, 1, 1)])
        assert_raises(ValueError, func, arg, branches=(head, 1, 1, 1))
        # can only truncate 1d arrays
        assert_raises(TypeError, func, arg, branches=(fields_md[0], 0))
        # expressions
        as_objects = func(arg, branches='{0}==0'.format(head))
        assert_equal(as_objects.dtype, 'O')
        truncated = func(arg, branches=('{0}==0'.format(head), 0))
        assert_equal(truncated.dtype, as_objects[0].dtype)
def run(input_fns, output_fn, h1, h2, h3):
    """Select three-body candidates and write them to ``output_fn``.

    The single tree of the input files is read, the ``H1_``/``H2_``/``H3_``
    columns are reordered per row, momenta, energies and the combined mass
    ``B_M`` are computed under the mass hypotheses ``h1``, ``h2``, ``h3``
    (looked up in ``mass_dict``), and muon, ``IPChi2`` and PID cuts are
    applied. The result is written with key ``B2{h1}{h2}{h3}``.

    Parameters:
        input_fns: list of input ROOT file names; the first one is used to
            find the tree name.
        output_fn: name of the ROOT file to (over)write.
        h1, h2, h3: particle hypotheses for the three children; they must
            already be in sorted order.

    Raises:
        AssertionError: if the first input file does not hold exactly one
            tree, or if ``[h1, h2, h3]`` is not sorted.
    """
    keys = list_trees(input_fns[0])
    assert len(keys) == 1, keys
    df = read_root(input_fns, keys[0])
    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24; the resulting dtype is the same.
    df['H1_isMuon'] = df['H1_isMuon'].astype(bool)
    df['H2_isMuon'] = df['H2_isMuon'].astype(bool)
    df['H3_isMuon'] = df['H3_isMuon'].astype(bool)

    # Sort the columns so that the first is the most kaon-like
    assert sorted([h1, h2, h3]) == [h1, h2, h3], 'Children are ranked from kaon-like to pion-like'
    order = np.argsort(df[['H3_ProbK', 'H2_ProbK', 'H1_ProbK']], axis=1)
    for col in [c for c in df.columns if c.startswith('H1_')]:
        col = col[len('H1_'):]
        cols = [f'H1_{col}', f'H2_{col}', f'H3_{col}']
        # apply the same per-row permutation to every H1/H2/H3 column triple
        df[cols] = df[cols].values[np.arange(order.shape[0])[:, None], order]

    # Compute the PE and mass of all particles
    for head, mass in [('H1', mass_dict[h1]), ('H2', mass_dict[h2]), ('H3', mass_dict[h3])]:
        df.eval(f'{head}_P = sqrt({head}_PX**2 + {head}_PY**2 + {head}_PZ**2)', inplace=True)
        df.eval(f'{head}_PE = sqrt({mass}**2 + {head}_P**2)', inplace=True)
    for component in ['PE', 'PX', 'PY', 'PZ']:
        df.eval(
            f'B_{component} = H1_{component} + H2_{component} + H3_{component}',
            inplace=True)
    df.eval('B_M = sqrt(B_PE**2 - B_PX**2 - B_PY**2 - B_PZ**2)', inplace=True)

    # if [h1, h2, h3] == ['K', 'K', 'K']:
    # Apply ignore muons
    df.query('~(H1_isMuon | H2_isMuon | H3_isMuon)', inplace=True)
    # Apply an additional selection
    df.query('(H1_IPChi2 > 25) & (H2_IPChi2 > 25) & (H3_IPChi2 > 25)', inplace=True)
    # Apply a PID selection
    df.query(
        f'(H1_Prob{h1} > {pid_cut}) & (H2_Prob{h2} > {pid_cut}) & (H3_Prob{h3} > {pid_cut})',
        inplace=True)

    to_root(df, output_fn, key=f'B2{h1}{h2}{h3}', mode='w', store_index=False)
def table_from_root(f, treename=None, include_names=None, **kwargs):
    """Read a Table from a tree in one or more ROOT files.

    ``f`` is expanded with ``file_list``. When ``treename`` is not given the
    first file must contain exactly one tree. A ``selection`` given as a
    list or tuple is joined with ``' && '``. Remaining keywords are passed
    to ``root_numpy.root2array``.
    """
    import root_numpy

    # ``columns`` is the deprecated spelling of ``include_names``; it is only
    # consumed when the new keyword was not supplied
    if include_names is None and 'columns' in kwargs:
        include_names = kwargs.pop('columns')
        warnings.warn("Keyword argument `columns` has been renamed to "
                      "`include_names` to better match default "
                      "astropy.table.Table.read kwargs, please update "
                      "your call.", DeprecationWarning)

    # parse column filters into tree2array ``selection`` keyword
    if 'selection' in kwargs:
        filters = kwargs.pop('selection')
        if isinstance(filters, (list, tuple)):
            filters = ' && '.join(filters)
        kwargs['selection'] = filters

    # find single tree (if only one tree present)
    files = file_list(f)
    if treename is None:
        first = files[0]
        trees = root_numpy.list_trees(first)
        if len(trees) == 0:
            raise ValueError("No trees found in %s" % first)
        if len(trees) > 1:
            raise ValueError("Multiple trees found in %s, please select on "
                             "via the `treename` keyword argument, e.g. "
                             "`treename='events'`. Available trees are: %s."
                             % (first, ', '.join(map(repr, trees))))
        treename = trees[0]

    # read and return
    array = root_numpy.root2array(files, treename, branches=include_names,
                                  **kwargs)
    return Table(array)
def __init__(self, files, chunksize, n_files=100000, key=None, **kwargs):
    """
    Creates a DataChunk object building the sample combining several
    root files.

    Arguments
      files - list of strings
        List of file names to be read

      chunksize - int
        Total number of rows to be picked from the various files

      n_files - int
        Maximal number of randomly selected files to pick from in a single
        chunk. Default: 100000

      key - string or None
        Name of the TTree to be loaded. Can be None (default) if a single
        TTree is defined per TFile; the first tree listed in each file is
        then used.

    Other arguments are passed to root_numpy.root2array complementing the
    arguments: `file`, `treename`, `start`, `stop` defined by DataChunks.

    Raises
      IOError if a file cannot be opened, contains no tree, or does not
      contain the requested tree.
    """
    self._files = list()
    self._n_files = n_files
    self._chunksize = chunksize
    self._ntot = 0
    self._kwargs = kwargs

    for f in files:
        key_ = key if key else None
        if not key_:
            # no name given: take the first tree listed in the file
            for key_ in rnp.list_trees(f):
                break
        if not key_:
            raise IOError("No tree found in file %s" % f)

        root_file = ROOT.TFile.Open(f)
        if not root_file:
            raise IOError("File %s could not be opened" % f)
        try:
            root_tree = root_file.Get(key_)
            # the second check must test the tree, not the file again
            if not root_tree:
                raise IOError(
                    "Tree %s could not be read from file %s" % (key_, f))
            entries = root_tree.GetEntries()
        finally:
            # only the entry count is kept, so the file can be closed now
            root_file.Close()

        self._files.append((f, key_, entries))
        self._ntot += entries
def _LoadRoot(filepath):
    """Load the 'optics' tree of a ROOT file, or else its 'orbit' tree, into a BDSAsciiData.

    Every branch becomes a property of the returned object and every tree
    entry is appended as a list holding one value per branch.

    Raises IOError when root_numpy is not available or when the file has
    neither tree.
    """
    if not useRootNumpy:
        raise IOError("root_numpy not available - can't load ROOT file")

    data = BDSAsciiData()
    trees = _rnp.list_trees(filepath)

    # 'optics' takes precedence over 'orbit'
    if 'optics' in trees:
        tree_name = 'optics'
    elif 'orbit' in trees:
        tree_name = 'orbit'
    else:
        raise IOError("This file doesn't have the required tree 'optics'.")

    branches = _rnp.list_branches(filepath, tree_name)
    treedata = _rnp.root2array(filepath, tree_name)

    n_entries = len(treedata[branches[0]])
    for entry in range(n_entries):
        row = []
        for branch in branches:
            if entry == 0:
                # register each branch once, while handling the first entry
                data._AddProperty(branch)
            row.append(treedata[branch][entry])
        data.append(row)
    return data
def read_root(fname, tree_name=None, variables=None, ignore=None, chunksize=None, *kargs, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *kargs and *kwargs are passed to root_numpy's root2array.
    If the root file contains a branch called index, it will become the DataFrame's index.

    Parameters
    ----------
    fname: string
        The filename of the root file
    tree_name: string
        The name of the tree to load. May be omitted when the file contains
        exactly one tree.
    variables: sequence
        A sequence of shell-patterns. Matching variables are read.
        The sequence passed in is not modified.
    ignore: sequence
        A sequence of shell-patterns. All matching variables are ignored (overriding the variables argument)
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with chunksize rows

    Returns
    -------
        DataFrame from the ROOT file

    Raises
    ------
        ValueError if no tree name is given and the file contains no tree or
        more than one tree, or if the index variable would be ignored.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', variables=['x_*', 'y_*'], selection='x_1 > 100')

    """
    if not tree_name:
        trees = list_trees(fname)
        if len(trees) == 1:
            tree_name = trees[0]
        elif not trees:
            # used to be reported as "More than one tree found"
            raise ValueError('No trees found in {}'.format(fname))
        else:
            raise ValueError('More than one tree found in {}'.format(fname))

    if not variables:
        all_vars = None
    else:
        if isinstance(variables, str):
            variables = [variables]
        # index is always requested; build a new list so that the caller's
        # sequence is left untouched
        variables = list(variables) + ['index']
        all_vars = get_matching_variables(fname, tree_name, variables)

    if ignore:
        if not all_vars:
            all_vars = get_matching_variables(fname, tree_name, ['*'])

        ignored = get_matching_variables(fname, tree_name, ignore)
        if 'index' in ignored:
            raise ValueError('index variable is being ignored!')
        # Filtering, unlike list.remove, does not fail when an ignored
        # variable was never selected by ``variables``.
        all_vars = [var for var in all_vars if var not in ignored]

    if chunksize:
        f = ROOT.TFile(fname)
        n_entries = f.Get(tree_name).GetEntries()
        f.Close()

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(fname, tree_name, all_vars,
                                 start=chunk * chunksize,
                                 stop=(chunk + 1) * chunksize,
                                 *kargs, **kwargs)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(fname, tree_name, all_vars, *kargs, **kwargs)
    return convert_to_dataframe(arr)
# Script fragment (Python 2 print statements): choose the list of input
# branches, print the trees found in every background file, then build the
# background/signal samples through ``preprocessing``.
# NOTE(review): ``utils`` and ``preprocessing`` are not defined in this
# fragment - presumably imported earlier in the script; confirm.

# Alternative branch list kept for reference (DeepCSV variables):
#branch_names = 'absCosThetaStar_CS,absCosTheta_bb,absCosTheta_gg,PhoJetMinDr,customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingJet_DeepCSV,subleadingJet_DeepCSV,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverM,diphotonCandidatePtOverdiHiggsM,dijetCandidatePtOverdiHiggsM,leadingJet_bRegNNResolution,subleadingJet_bRegNNResolution,noexpand:sigmaMJets/Mjj,noexpand:leadingPhoton_pt/CMS_hgg_mass,noexpand:subleadingPhoton_pt/CMS_hgg_mass,noexpand:leadingJet_pt/Mjj,noexpand:subleadingJet_pt/Mjj,PhoJetOtherDr'.split(",")
#DeepJet (the list that is actually used)
branch_names = 'absCosThetaStar_CS,absCosTheta_bb,absCosTheta_gg,PhoJetMinDr,customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingJet_DeepFlavour,subleadingJet_DeepFlavour,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverM,diphotonCandidatePtOverdiHiggsM,dijetCandidatePtOverdiHiggsM,leadingJet_bRegNNResolution,subleadingJet_bRegNNResolution,sigmaMJets,noexpand:leadingPhoton_pt/CMS_hgg_mass,noexpand:subleadingPhoton_pt/CMS_hgg_mass,noexpand:leadingJet_pt/Mjj,noexpand:subleadingJet_pt/Mjj,PhoJetOtherDr,rho'.split(",")
#DeepJet + Mjj (alternative kept for reference)
#branch_names = 'absCosThetaStar_CS,absCosTheta_bb,absCosTheta_gg,PhoJetMinDr,customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingJet_DeepFlavour,subleadingJet_DeepFlavour,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverM,diphotonCandidatePtOverdiHiggsM,dijetCandidatePtOverdiHiggsM,leadingJet_bRegNNResolution,subleadingJet_bRegNNResolution,sigmaMJets,Mjj,rho'.split(",")

# drop stray whitespace around the comma-separated names
branch_names = [c.strip() for c in branch_names]
print branch_names

import pandas as pd
import root_pandas as rpd
from root_numpy import root2array, list_trees

# show which trees each background file contains
for i in range(len(utils.IO.backgroundName)):
    print list_trees(utils.IO.backgroundName[i])

# "bbggSelectionTree" is the tree name handed to preprocessing together with
# the branch list
preprocessing.set_signals_and_backgrounds("bbggSelectionTree", branch_names)
X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.set_variables(
    branch_names)

#relative weighting between components of one class is kept, all classes normalized to the same
#weights_sig=preprocessing.weight_signal_with_resolution(weights_sig,y_sig)
#!!!! reweight 28112019
weights_bkg, weights_sig = preprocessing.normalize_process_weights(
    weights_bkg, y_bkg, weights_sig, y_sig)

# NOTE(review): presumably shuffles features, labels and weights together -
# confirm in preprocessing.randomize
X_bkg, y_bkg, weights_bkg = preprocessing.randomize(X_bkg, y_bkg, weights_bkg)
X_sig, y_sig, weights_sig = preprocessing.randomize(X_sig, y_sig, weights_sig)

print X_bkg.shape
print y_bkg.shape
# plt.rcParams.update({'font.size': 18}) parser = argparse.ArgumentParser(prog='./Efficiencies') parser.add_argument('-i', '--input', nargs='+', help='specify input root file') parser.add_argument('-o', '--output', default='./', help='specify output dir') args = parser.parse_args() inputs = args.input output = args.output if not os.path.isdir(output): os.mkdir(output) if isinstance(inputs, (list, )): treeName = list_trees(inputs[0]) else: treeName = list_trees(inputs) inputs = [ inputs, ] if (len(treeName) > 1): print("more then one tree in file ... specify, which tree to use") exit() # acceptance selection selection = 'z_genMass > 56 ' \ '& z_genMass < 116 ' \ '& muon_genPt > 30 ' \ '& antiMuon_genPt > 30 ' \
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and *kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load. May be omitted when the first file
        contains exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
        Ignoring a column that was not selected by `columns` has no effect.
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
        individual entries. All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Raises
    ------
        ValueError if no key is given and the first file contains no tree or
        more than one tree, or if an __index__* branch would be ignored.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        all_vars = branches
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = list(filter(lambda x: x.startswith('__index__'), branches))
        if index_branches:
            # list(...) copies the caller's sequence and also accepts tuples
            columns = list(columns)
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if any(map(lambda x: x.startswith('__index__'), ignored)):
            raise ValueError('__index__* branch is being ignored!')
        # Filtering, unlike list.remove, does not raise when an ignored branch
        # was never selected by ``columns``.
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr, flatten):
        # Stretch the array columns into one row per element and record the
        # position of each element in the ``__array_index`` field.
        if flatten is True:
            warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
                          "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            nonscalar = get_nonscalar_columns(arr)
            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
                              .format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        # a TChain over all files gives the total number of entries
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)
        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
# Exploratory script: open a ROOT phase-space file, read one array with
# root_numpy and show it as a histogram.
import numpy as np
import math
# The plotting import was commented out although plt is used below, which
# made the script die with a NameError before anything was drawn.
from matplotlib import pylab as plt
from ROOT import TTree, TFile, TH2D, TCanvas, TH1F, gROOT
from root_numpy import array2hist, hist2array, fill_hist, tree2array, root2array, list_trees

filename = "../output/ClusterAllOne_main_phs1_gamma.root"
inputFile = TFile(filename)
tree = inputFile.Get("PhaseSpace")

# list_trees / root2array are documented to take file names, not TFile/TTree
# objects, so hand them the path that was opened above.
list_trees(filename)
# Energy = hist2array(inputFile)
# NOTE(review): "dX" is kept as the tree name exactly as originally written;
# if it is really a branch of "PhaseSpace", use
# tree2array(tree, branches="dX") instead -- confirm against the file layout.
E_dep = root2array(filename, treename="dX")

# The original plotted an undefined name (E_kinetic); E_dep is the only array
# this script actually loads.
plt.hist(E_dep)
plt.pause(0.01)
#print(E_dep)
#np.save('pythonArr', E_dep)
input("press enter to exit")
def test_list_trees():
    """Check that list_trees reports exactly the tree named 'tree' for vary1.root."""
    expected_names = ['tree']
    found_names = rnp.list_trees(load('vary1.root'))
    assert_equal(found_names, expected_names)
# '17E': glob.glob(storage+"_V09_UL2017E/201214_134634/0000/output_0_*"), # '17F': glob.glob(storage+"_V09_UL2017F/201214_141544/0000/output_0_*"), '17H': glob.glob(storage + "_V11_UL2017H/210420_072656/0000/output_0_*"), # '18A': glob.glob(storage+"_V11_UL2018A/210420_072218/0000/output_0_*"), # '18B': glob.glob(storage+"_V11_UL2018B/210420_072153/0000/output_0_*"), # '18C': glob.glob(storage+"_V11_UL2018C/210420_072235/0000/output_0_*"), # '18D': glob.glob(storage+"_V11_UL2018D/210420_072249/000?/output_0_*"), # # Missing runs from 2018 # '18D': glob.glob(storage+"_V11_UL2018D_v3/210506_184017/0000/output_0_*") } for era, input in inputs.iteritems(): print("##### era {0}".format(era)) treeName = rn.list_trees(input[0])[0] print(">>> Load Events from {0} files".format(len(input))) _df = [] for i in input: print("> file " + i) tfile = ROOT.TFile.Open(i) if (tfile.Get(treeName).GetEntries(selection) == 0): print("> no events in this file found! continue with next file") continue _df.append( tree_to_df( rn.root2array(i, treeName, selection=selection, branches=branches))) print(">>> Concatenate")
if len(options.data) > 2: raise SystemExit("ERROR: To many arguments for the data file with -d. Use: -d filename.root -d tree.") if options.bins == None: logging.info("No binning with -n specified, use the default value 100") bins = 100 else: bins = options.bins #Reference MC referenceMC = options.montecarlo[0] #if only one argument to -m or -d is given it is assumed that there is only one tree if len(options.montecarlo) == 1: trees = list_trees(referenceMC) if len(trees) == 1: referenceMC_tree = trees[0] else: raise SystemExit('No tree or more than one found in ', referenceMC ) else: referenceMC_tree = options.montecarlo[1] #Reference Data referenceData = options.data[0] #if only one argument to -m or -d is given it is assumed that there is only one tree if len(options.data) == 1: trees = list_trees(referenceData) if len(trees) == 1: referenceData_tree = trees[0] else:
def get_info(self):
    """Print, for every tree found in ``self.file``, a capitalised header line
    followed by the list of that tree's branches."""
    source = self.file
    for tree_name in rn.list_trees(source):
        header = str.capitalize(tree_name) + ":"
        print(header)
        branch_names = rn.list_branches(source, treename=tree_name)
        print(branch_names)
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load. If omitted, the first file must contain
        exactly one tree, which is then used.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored
        (overriding the columns argument). Patterns matching branches that were not selected
        in the first place are silently accepted.
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
        individual entries. All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or, when
        `chunksize` is given, a generator yielding one DataFrame per chunk.

    Raises
    ------
    ValueError
        If `key` is omitted and the first file holds zero or several trees,
        or if `ignore` matches an `__index__*` branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        # Work on a copy so the filtering below never touches `branches`.
        all_vars = list(branches)
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = [name for name in branches if name.startswith('__index__')]
        if index_branches:
            # Copy first: the caller's list must not grow as a side effect.
            columns = columns[:]
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(map(expand_braces, columns)))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(map(expand_braces, ignored)))
        if any(name.startswith('__index__') for name in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of calling list.remove(): remove() raised ValueError
        # when an ignored branch was not part of the selection (e.g. `columns`
        # and `ignore` naming different branches) or was matched twice.
        ignored = set(ignored)
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr, flatten):
        # Stretch array-valued columns into one row per element and record the
        # position of each element in an extra '__array_index' field.
        if flatten is True:
            warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
                          "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            nonscalar = get_nonscalar_columns(arr)
            # Keep scalar columns plus the non-scalar ones explicitly requested.
            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
                              .format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        # Chain all files only to learn the total number of entries.
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            # Each chunk is read separately via root2array's start/stop window.
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk + 1) * chunksize,
                                 selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)
        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load. If given, the first path containing a
        tree of that name is used to look up the branches.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored
        (overriding the columns argument). Patterns matching branches that were not selected
        in the first place are silently accepted.
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
        individual entries. All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or, when
        `chunksize` is given, an iterable (supporting ``len()``) that yields
        one DataFrame per non-empty chunk.

    Raises
    ------
    OSError
        If `key` is given but found in none of the paths, or if no path was
        available to search.
    ValueError
        If `key` is omitted and the inspected file holds zero or several
        trees, or if `ignore` matches an `__index__*` branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches, ensuring the key exists
    for seed_path in paths:
        trees = list_trees(seed_path)
        if key and key not in trees:
            continue
        break
    else:
        # Loop finished without `break`: no usable seed path was found.
        if key:
            raise OSError('{} not found in any of the given paths'.format(key))
        else:
            raise OSError('No trees found in any of the given paths')

    if not key:
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        # Work on a copy so the filtering below never touches `branches`.
        all_vars = list(branches)
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = [name for name in branches if name.startswith('__index__')]
        if index_branches:
            # Copy first: the caller's list must not grow as a side effect.
            columns = columns[:]
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(map(expand_braces, columns)))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(map(expand_braces, ignored)))
        if any(name.startswith('__index__') for name in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of calling list.remove(): remove() raised ValueError
        # when an ignored branch was not part of the selection (e.g. `columns`
        # and `ignore` naming different branches) or was matched twice.
        ignored = set(ignored)
        all_vars = [var for var in all_vars if var not in ignored]

    if chunksize:
        # Chain all files only to learn the total number of entries.
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        n_chunks = int(ceil(float(n_entries) / chunksize))
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        class genchunk(object):
            # Re-iterable view over the chunks. len() reports the number of
            # chunk windows, which can exceed the number of DataFrames yielded
            # because empty chunks are skipped during iteration.
            def __len__(self):
                return n_chunks

            def __iter__(self):
                # Running row offset handed to convert_to_dataframe for each chunk.
                current_index = 0
                for chunk in range(n_chunks):
                    arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk + 1) * chunksize,
                                     selection=where, *args, **kwargs)
                    if len(arr) == 0:
                        continue
                    if flatten:
                        arr = do_flatten(arr, flatten)
                    yield convert_to_dataframe(arr, start_index=current_index)
                    current_index += len(arr)
        return genchunk()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
def read_root(path, tree_key=None, columns=None, ignore=None, chunksize=None, where=None, *kargs, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *kargs and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch called index, it will become the DataFrame's index.

    Parameters
    ----------
    path: string
        The path to the root file
    tree_key: string
        The key of the tree to load. If omitted, the file must contain
        exactly one tree, which is then used.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored
        (overriding the columns argument). Patterns matching branches that were not selected
        in the first place are silently accepted.
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows
    where: str
        Only rows that match the expression will be read

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or, when
        `chunksize` is given, a generator yielding one DataFrame per chunk.

    Raises
    ------
    ValueError
        If `tree_key` is omitted and the file holds zero or several trees,
        or if `ignore` matches the `index` branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not tree_key:
        trees = list_trees(path)
        if len(trees) == 1:
            tree_key = trees[0]
        elif len(trees) == 0:
            # Previously reported as "More than one tree found", which was
            # misleading for a file without any tree.
            raise ValueError('No trees found in {}'.format(path))
        else:
            raise ValueError('More than one tree found in {}'.format(path))

    branches = list_branches(path, tree_key)

    if not columns:
        # Work on a copy so the filtering below never touches `branches`.
        all_vars = list(branches)
    else:
        # index is always loaded if it exists
        if isinstance(columns, string_types):
            columns = [columns]
        if 'index' in branches:
            # Copy first: the caller's list must not grow as a side effect.
            columns = columns[:]
            columns.append('index')
        columns = list(itertools.chain.from_iterable(map(expand_braces, columns)))
        all_vars = get_matching_variables(branches, columns)

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(map(expand_braces, ignored)))
        if 'index' in ignored:
            raise ValueError('index variable is being ignored!')
        # Filter instead of calling list.remove(): remove() raised ValueError
        # when an ignored branch was not part of the selection (e.g. `columns`
        # and `ignore` naming different branches) or was matched twice.
        ignored = set(ignored)
        all_vars = [var for var in all_vars if var not in ignored]

    if chunksize:
        # Open the file only to learn the number of entries; close it even if
        # the tree lookup fails so the handle is not leaked.
        f = ROOT.TFile(path)
        try:
            n_entries = f.Get(tree_key).GetEntries()
        finally:
            f.Close()

        def genchunks():
            # Each chunk is read separately via root2array's start/stop window.
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(path, tree_key, all_vars, start=chunk * chunksize, stop=(chunk + 1) * chunksize,
                                 selection=where, *kargs, **kwargs)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(path, tree_key, all_vars, selection=where, *kargs, **kwargs)
    return convert_to_dataframe(arr)
# Command-line script: read every tree of the given ROOT ".tree" file with
# root2array and store the resulting arrays in a numpy ".npz" archive.
from root_numpy import root2array, tree2array
import root_numpy
import numpy as np
import argparse

parser = argparse.ArgumentParser(
    description='Convert ".tree" file to compressed numpy ".npz"')
parser.add_argument('--path', metavar='p', type=str,
                    help='Path to ".tree" file')
parser.add_argument('--output', metavar='o', type=str,
                    help='Output name')
args = parser.parse_args()

filename = args.path
trees = root_numpy.list_trees(filename)

# One array per tree, in the order reported by list_trees.
array = [root2array(filename, tree_name) for tree_name in trees]

# NOTE(review): np.savez stores extra keyword arguments as named arrays, so
# depending on the NumPy version `allow_pickle=True` may end up as an array
# called 'allow_pickle' rather than acting as an option -- confirm. The
# archive is also written uncompressed despite the description above.
np.savez(args.output, array, allow_pickle=True)
import numpy as np
import matplotlib
# Select the non-interactive PDF backend before pyplot is imported.
matplotlib.use('pdf')
import matplotlib.pyplot as plt

# Command line: a single mandatory input ROOT file.
parser = argparse.ArgumentParser(prog='./ZMuMUEfficiency')
parser.add_argument('-i', '--input', required=True,
                    help='specify input root file')
args = parser.parse_args()

input = args.input

# Names of all trees in the input file; stop if the choice is ambiguous.
treeName = list_trees(input)
if len(treeName) > 1:
    print("more then one tree in file ... specify, which tree to use")
    exit()

# Acceptance selection, assembled from adjacent string literals.
selection = (
    'ZStableMass > 66 '
    '& ZStableMass < 116 '
    '& (ZDecayMode == 13 | ZDecayMode == 151313) '
    '& ZLeptonPt > 27 '
    '& ZAntiLeptonPt > 27 '
    '& abs(ZLeptonEta) < 2.4 '
    '& abs(ZAntiLeptonEta) < 2.4 '
)

# Branches to load from the tree.
branches = ['MuonProbeCategory', 'nPV', 'eventWeight']
data_loaded = yaml.load(stream) channels_models = data_loaded["models"] files = data_loaded["files"] if index_files != None: files = [files[index] for index in index_files] full_output = data_loaded["full output"] output_folder = data_loaded["output_folder"] n_processes = data_loaded["n_processes"] mode = data_loaded["mode"] args = [] managers = {} locks = {} for index, f in enumerate(files): trees = list_trees(f) for tree in trees: if tree in channels_models: model_path = channels_models[tree][1] channel = channels_models[tree][0] else: continue foldername, treename = tree.split("/") output_filename = get_output_filename(f, output_folder) if not output_filename in managers: managers[output_filename] = Manager() locks[output_filename] = managers[output_filename].Lock() lock = locks[output_filename] args.append([ f, treename, foldername, output_filename, model_path, full_output, channel, mode, lock