def get_all_vars_in_rds(rds_name):
    """
    Return a flat list of all leaf names in the 'SVTree' tree of the
    reduced dataset stored at rds_name.
    """
    from ROOT import TFile
    the_rds = TFile(rds_name)
    tree = the_rds.Get('SVTree')
    leaf_dict = utils.get_leaves_in_tree(tree)
    all_leaf_names = []
    for name_list in leaf_dict.values():
        all_leaf_names += name_list
    return all_leaf_names
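# Hedged usage sketch (not part of the module): 'reduced_dataset.root' is a
# hypothetical file name. utils.get_leaves_in_tree is assumed to return a
# dict keyed by ROOT type name (e.g. 'Int_t', 'Double_t'), which
# get_all_vars_in_rds flattens into one list.
def _example_dump_rds_vars():
    for var in sorted(get_all_vars_in_rds('reduced_dataset.root')):
        print var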
def run(self):
    # relies on module-level imports of os, re, itertools, profile,
    # training, utils, and warnings.warn
    reduced_dataset = self._reduced_dataset
    working_dir = self._working_dir
    training_variables = self._training_variables
    do_test = self._do_test

    # --- profiling
    profile_dir = os.path.join(working_dir, 'profile')
    if not os.path.isdir(profile_dir):
        os.mkdir(profile_dir)

    profile_file = os.path.join(profile_dir, 'profiled.root')
    if not os.path.isfile(profile_file):
        # fall back on a profile stored alongside the reduced dataset
        rds_dir = os.path.split(reduced_dataset)[0]
        alt_profile_file = os.path.join(rds_dir, 'profiled.root')
        if os.path.isfile(alt_profile_file):
            profile_file = alt_profile_file

    mean_rms_file = os.path.join(profile_dir, 'mean_rms.txt')
    if not do_test:
        if not os.path.isfile(mean_rms_file):
            if not os.path.isfile(profile_file):
                print '--- making profile file for normalization ---'
                profile.make_profile_file(reduced_dataset, profile_file)
            profile.build_mean_rms_from_profile(
                profile_file = profile_file,
                text_file_name = mean_rms_file)

    # --- training part
    training_dir = os.path.join(working_dir, self._training_subdir)
    if not os.path.isdir(training_dir):
        os.mkdir(training_dir)

    normalization_file = os.path.join(training_dir, 'normalization.txt')
    if not os.path.isfile(normalization_file) and not do_test:
        if not os.path.isfile(profile_file):
            profile.make_profile_file(reduced_dataset, profile_file)
        profile.make_normalization_file(
            profile_file,
            normalization_file = normalization_file,
            whitelist = training_variables)

    # read the normalization file: one '<name> <offset> <scale>' per line
    normalization_dict = {}
    if os.path.isfile(normalization_file):
        with open(normalization_file) as norm_file:
            for line in norm_file:
                line = line.strip()
                if not line:
                    continue
                name = line.split()[0]
                offset, scale = (float(n) for n in line.split()[1:])
                normalization_dict[name] = (offset, scale)

    # max() below fails on an empty dict, so only print when there is
    # something to show
    if normalization_dict:
        print 'normalization:'
        text_size = max(len(x) for x in normalization_dict.keys()) + 1
        for value, (offset, scale) in normalization_dict.iteritems():
            print '%-*s offset: % -10.4g scale: % -10.4g' % (
                text_size, value, offset, scale)

    weights_path = os.path.join(training_dir, 'weightMinimum.root')
    if not os.path.isfile(weights_path):
        print '--- running training ---'
        training.run_training(
            reduced_dataset = reduced_dataset,
            output_directory = training_dir,
            normalization = normalization_dict,
            flavor_weights = self._flavor_weights,
            nodes = self._nodes,
            debug = do_test,
            events = self._n_training_events,
            other_opt_dict = self._other_opt_dict)

    # --- diagnostics part
    testing_dir = os.path.join(working_dir, self._testing_subdir)
    if not os.path.isdir(testing_dir):
        os.mkdir(testing_dir)

    if not self._testing_ds:
        self._testing_ds = reduced_dataset

    augmented_tree = os.path.join(testing_dir, 'perf_ntuple.root')
    if not os.path.isfile(augmented_tree):
        print '--- augmenting reduced dataset with nn classifiers ---'
        # should wrap this in a function to close the file when done
        from ROOT import TFile
        testing_ds_file = TFile(self._testing_ds)
        the_tree = testing_ds_file.Get('SVTree')
        all_vars_in_tree = utils.get_leaves_in_tree(the_tree)

        # --- filter out branches we don't care about
        output_subset = ['bottom', 'light', 'charm', 'JetPt', 'JetEta']
        subset_regex = '|'.join([
            '^discriminator(?!Jet)',
            '^log[BCU][bcu]',
        ])
        branch_filter = re.compile(subset_regex)
        for branch in itertools.chain(*all_vars_in_tree.values()):
            if branch_filter.findall(branch):
                output_subset.append(branch)

        from jetnet import pynn
        pynn.augment_tree(
            in_file = self._testing_ds,
            nn_file = weights_path,
            out_file = augmented_tree,
            ints = all_vars_in_tree['Int_t'],
            doubles = all_vars_in_tree['Double_t'],
            extension = self._augment_extension,
            subset = output_subset,
            show_progress = True)

    profiled_path = os.path.splitext(augmented_tree)[0] + '_profile.root'
    if not os.path.isfile(profiled_path):
        print '--- profiling performance ntuple ---'
        profile.make_profile_file(reduced_dataset = augmented_tree,
                                  profile_file = profiled_path)

    output_paths = {
        'profile': profiled_path,
        'perf_ntuple': augmented_tree,
    }

    if self._do_more_diagnostics is not None:
        warn("do_more_diagnostics doesn't do anything now, please remove",
             SyntaxWarning,
             # stacklevel 5 needed to get through the multiprocess call
             stacklevel = 5)

    self.out_queue.put(output_paths)
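# A minimal sketch of the normalization file format parsed in run() above:
# each nonblank line is '<name> <offset> <scale>', e.g. 'JetPt -50.0 0.01'
# maps to normalization_dict['JetPt'] == (-50.0, 0.01). This helper is a
# hypothetical standalone version of that parsing loop, for illustration.
def _read_normalization_file(normalization_file):
    normalization_dict = {}
    with open(normalization_file) as norm_file:
        for line in norm_file:
            fields = line.split()
            if not fields:
                continue
            name = fields[0]
            offset, scale = (float(n) for n in fields[1:])
            normalization_dict[name] = (offset, scale)
    return normalization_dict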
from jetnet import pynn, utils
import sys
from ROOT import TFile

if __name__ == '__main__':
    if not 3 <= len(sys.argv) <= 4:
        sys.exit('usage: %s <in file> <nn file> [<output file>]'
                 % sys.argv[0])

    input_file = sys.argv[1]
    nn_file = sys.argv[2]
    try:
        output_file = sys.argv[3]
    except IndexError:
        # default: write '<in file stem>_aug.root' next to the input
        output_file = '.'.join(input_file.split('.')[:-1]) + '_aug.root'
    print 'making', output_file

    tree = 'SVTree'
    the_file = TFile(input_file)
    the_tree = the_file.Get(tree)
    all_vars_in_tree = utils.get_leaves_in_tree(the_tree)

    pynn.augment_tree(
        in_file = input_file,
        nn_file = nn_file,
        out_file = output_file,
        ints = all_vars_in_tree['Int_t'],
        doubles = all_vars_in_tree['Double_t'],
        )
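# Example invocations (the script and file names are hypothetical, not
# confirmed by this repository):
#
#     python augment.py perf_ntuple.root weightMinimum.root
#     python augment.py perf_ntuple.root weightMinimum.root out.root
#
# The first form writes 'perf_ntuple_aug.root' next to the input; the
# second writes to the explicitly given output file.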
def get_allowed_rds_variables(
    input_files,
    jet_collection = 'AntiKt4TopoEMJetsReTagged',
    full_dir_name = None,
    whitelist = None):
    """
    return a tuple of (double_variables, int_variables), checking the
    first of input_files for matches. If full_dir_name is given,
    jet_collection is ignored.

    Mon Jun 11 11:37:31 CEST 2012: Added some warnings wrt deprecation
    of jet_collection.
    """
    from ROOT import TFile
    sample_root_file = TFile(input_files[0])

    if jet_collection != 'AntiKt4TopoEMJetsReTagged':
        if full_dir_name:
            warn("specifying full_dir_name will overwrite jet_collection",
                 SyntaxWarning, stacklevel = 2)
        else:
            warn("jet_collection is going to be replaced, use full_dir_name",
                 FutureWarning, stacklevel = 2)

    if full_dir_name:
        input_tree_name = full_dir_name + '/PerfTreeAll'
    else:
        input_tree_name = (
            'BTag_%s_JetFitterTagNN/PerfTreeAll' % (jet_collection + 'AOD'))

    sample_tree = sample_root_file.Get(input_tree_name)

    # stupid bug fix
    # FIXME: fix stupid bug fix
    # n.b. '== None' (rather than 'is None') is deliberate: PyROOT's Get()
    # returns a null proxy object, not None itself
    if sample_tree == None:
        warn("could not find %s, trying other stuff" % input_tree_name)
        input_tree_name = '%s_JetFitterCharm/PerfTreeAll' % (jet_collection)
        maybe_tree = sample_root_file.Get(input_tree_name)
        if maybe_tree != None:
            sample_tree = maybe_tree
    if sample_tree == None:
        raise IOError("could not find %s" % input_tree_name)

    leaves_dict = utils.get_leaves_in_tree(sample_tree)

    if whitelist:
        warn("whitelist is going to be removed for the sake of transparency,"
             " you probably shouldn't use it", FutureWarning, stacklevel = 2)
        double_variables = [
            x for x in whitelist if x in leaves_dict['Double_t']]
        int_variables = [
            x for x in whitelist if x in leaves_dict['Int_t']]
    else:
        int_variables = leaves_dict['Int_t']
        int_variables.remove('Flavour')
        double_variables = leaves_dict['Double_t']
        double_variables.remove('Discriminator')
        # warn("no whitelist given, will return all vars", stacklevel = 2)
        # for flav in 'buc':
        #     double_variables.remove('Likelihood_' + flav)
        # for slimvar in ['JetPt','JetEta','mass']:
        #     double_variables.remove(slimvar)

    return double_variables, int_variables
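# Minimal usage sketch (the input file name is hypothetical; the directory
# name follows the 'BTag_<collection>AOD_JetFitterTagNN' pattern the
# function itself builds, and '/PerfTreeAll' is appended internally):
def _example_get_allowed_vars():
    doubles, ints = get_allowed_rds_variables(
        input_files = ['perf_ntuple_0.root'],
        full_dir_name = 'BTag_AntiKt4TopoEMJetsReTaggedAOD_JetFitterTagNN')
    print '%i double, %i int variables' % (len(doubles), len(ints))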