def OutputNewData(self, input_dir, list_sample, path_output, variables=None):
    """
    Given a model, produce the output
    The Network has never seen this data !
    """
    # Loop over datasets #
    logging.info('Input directory : %s' % input_dir)
    for f in list_sample:
        name = os.path.basename(f)
        full_path = os.path.join(input_dir, f)
        logging.info('Looking at %s' % f)
        # Get the data #
        if variables is None:
            var = parameters.inputs + parameters.outputs + parameters.other_variables
        else:
            var = copy.deepcopy(variables)  # Avoid bug where variables is changed at each new file
        if self.generator:
            data = None
        else:
            data = Tree2Pandas(input_file=full_path,
                               variables=var,
                               weight=parameters.weights,
                               cut=parameters.cut,
                               reweight_to_cross_section=False)
            if data.shape[0] == 0:
                logging.info('\tEmpty tree')
                continue  # Avoids empty trees
        self.OutputFromTraining(data=data, path_output=path_output, output_name=name)
def _recoverFromROOT(self):
    branches = ['ll_M',
                'jj_M',
                'lljj_M',
                'total_weight',
                'Prob_global_HToZA',
                ('Prob_param_HToZA_mH_%0.2f_mA_%0.2f' % (self.mH, self.mA)).replace('.', 'p')]
    #list_variables = ListBranches(self.save_path)
    #self.data = Tree2Pandas(input_file=self.save_path, variables=list_variables)
    self.data = Tree2Pandas(input_file=self.save_path, variables=branches)
    self.size = self.data.shape[0]
    print('Output recovered from : ' + self.save_path)
    print('\tSample size : %d' % self.size)
    self._memoryUsage()
def _getData(self):
    data = []
    manager = enlighten.get_manager()
    pbar = manager.counter(total=len(self.list_files), desc='Progress', unit='File')
    for f in self.list_files:
        df = Tree2Pandas(input_file=f,
                         variables=self.variables,
                         cut=self.cuts,
                         reweight_to_cross_section=True,
                         n=self.end,
                         start=self.start,
                         tree_name='t')
        self._memoryUsage()
        data.append(df)
        pbar.update()
    manager.stop()
    self.data = pd.concat(data, axis=0)
    self.size = self.data.shape[0]
    print('Sample size : %d' % self.size)
    if self.size == 0:
        sys.exit('Empty tree')
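# Note: the Tree2Pandas helper used throughout these snippets is not shown here.
# Below is a minimal sketch of what such a helper presumably does, based on
# root_numpy (whose array2root is used elsewhere in this code); the function name
# `tree_to_pandas_sketch` and its exact signature are assumptions, not the
# project's actual implementation.
import pandas as pd
import ROOT
from root_numpy import tree2array

def tree_to_pandas_sketch(input_file, variables, cut=None, start=None, stop=None, tree_name='t'):
    # Open the ROOT file, read the requested branches into a structured array,
    # then wrap it in a pandas DataFrame
    root_file = ROOT.TFile.Open(input_file)
    tree = root_file.Get(tree_name)
    array = tree2array(tree, branches=variables, selection=cut, start=start, stop=stop)
    root_file.Close()
    return pd.DataFrame(array)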
def MakeScaler(data=None, list_inputs=[], generator=False, batch=5000, list_samples=None, additional_columns={}):
    logging.info('Starting computation for the scaler')
    # Generate scaler #
    scaler = preprocessing.StandardScaler()
    if not os.path.exists(parameters.scaler_path):
        # Not generator #
        if data is not None:
            scaler.fit(data[list_inputs])
        # For generator #
        if generator:
            if list_samples is None:
                raise RuntimeError("Generator mode requested, you need to provide a sample list")
            logging.info("Computing mean")
            # Mean Loop #
            mean = np.zeros(len(list_inputs))
            Ntot = 0
            pbar = enlighten.Counter(total=len(list_samples), desc='Mean', unit='File')
            for f in list_samples:
                pbar.update()
                if not os.path.exists(f):
                    continue
                file_handle = TFile.Open(f)
                if not file_handle.GetListOfKeys().Contains(parameters.tree_name):
                    continue
                tree = file_handle.Get(parameters.tree_name)
                N = tree.GetEntries()
                Ntot += N
                file_handle.Close()
                logging.debug("Opening file %s (%d entries)" % (f, N))
                # Loop over batches #
                for i in range(0, N, batch):
                    array = Tree2Pandas(f,
                                        list_inputs,
                                        start=i,
                                        stop=i + batch,
                                        additional_columns=additional_columns,
                                        tree_name=parameters.tree_name)[[inp.replace('$', '') for inp in list_inputs]].astype(np.float32).values
                    mean += np.sum(array, axis=0)
            mean /= Ntot
            # Var Loop #
            logging.info("Computing std")
            std = np.zeros(len(list_inputs))
            pbar = enlighten.Counter(total=len(list_samples), desc='Std', unit='File')
            for f in list_samples:
                pbar.update()
                if not os.path.exists(f):
                    continue
                file_handle = TFile.Open(f)
                if not file_handle.GetListOfKeys().Contains(parameters.tree_name):
                    continue
                tree = file_handle.Get(parameters.tree_name)
                N = tree.GetEntries()
                file_handle.Close()
                logging.debug("Opening file %s (%d entries)" % (f, N))
                # Loop over batches #
                for i in range(0, N, batch):
                    array = Tree2Pandas(f,
                                        list_inputs,
                                        start=i,
                                        stop=i + batch,
                                        additional_columns=additional_columns,
                                        tree_name=parameters.tree_name)[[inp.replace('$', '') for inp in list_inputs]].astype(np.float32).values
                    std += np.sum(np.square(array - mean), axis=0)
            std = np.sqrt(std / Ntot)
            # Set manually #
            scaler.mean_ = mean
            scaler.scale_ = std
        # Disable preprocessing on onehot variables #
        scaler.mean_[parameters.mask_op] = 0.
        scaler.scale_[parameters.mask_op] = 1.
        # Safety checks #
        scaler.mean_[np.isnan(scaler.mean_)] = 0.
        scaler.scale_[np.isnan(scaler.scale_)] = 1.
        scaler.scale_[scaler.scale_ == 0.] = 1.
        scaler.var_ = scaler.scale_ ** 2
        # Save #
        with open(parameters.scaler_path, 'wb') as handle:
            pickle.dump(scaler, handle)
        logging.info('Scaler %s has been created' % parameters.scaler_name)
    # If it already exists, import it #
    else:
        with open(parameters.scaler_path, 'rb') as handle:
            scaler = pickle.load(handle)
        logging.info('Scaler %s has been imported' % parameters.scaler_name)
    # Test the scaler #
    if data is not None:
        try:
            y = scaler.transform(data[list_inputs])
            # Compute mean and var for inputs not in onehot encoding #
            mean_scale = np.mean(y[:, [not m for m in parameters.mask_op]])
            var_scale = np.var(y[:, [not m for m in parameters.mask_op]])
            if abs(mean_scale) > 0.01 or abs((var_scale - 1) / var_scale) > 0.1:  # Check that the scaling is within tolerance
                logging.warning("Something is wrong with scaler '%s' (mean = %0.6f, var = %0.6f), maybe you loaded an incorrect scaler" % (parameters.scaler_name, mean_scale, var_scale))
        except ValueError:
            logging.warning("Problem with the scaler '%s' you imported, has the data changed since it was generated ?" % parameters.scaler_name)
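# Minimal usage sketch of MakeScaler (assumption: a `parameters` module configured
# with inputs, scaler_path, scaler_name, tree_name and mask_op as used above;
# `train_df` and `list_of_root_files` are hypothetical placeholders, not names
# from this code).
import pickle

# In-memory mode: fit the StandardScaler directly on a DataFrame of training inputs.
MakeScaler(data=train_df, list_inputs=parameters.inputs)

# Generator mode: accumulate mean and std in batches over the input files,
# without loading everything into memory at once.
MakeScaler(generator=True,
           list_samples=list_of_root_files,
           list_inputs=parameters.inputs,
           batch=5000)

# In both cases the fitted scaler is pickled at parameters.scaler_path and can
# be reloaded later for inference.
with open(parameters.scaler_path, 'rb') as handle:
    scaler = pickle.load(handle)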
def OutputNewData(self, input_dir, list_sample, path_output, variables=None):
    """
    Given a model, produce the output
    The Network has never seen this data !
    """
    # Loop over datasets #
    logging.info('Input directory : %s' % input_dir)
    for f in list_sample:
        name = os.path.basename(f)
        full_path = os.path.join(input_dir, f)
        logging.info('Looking at %s' % f)
        # Get the data #
        if variables is None:
            var = parameters.inputs + parameters.outputs + parameters.other_variables
        else:
            var = copy.deepcopy(variables)  # Avoid bug where variables is changed at each new file
        if self.generator:
            # Get number of events in file #
            rootfile = ROOT.TFile(full_path)
            tree = rootfile.Get("tree")
            N = tree.GetEntries()
            rootfile.Close()
            self.generator = False  # Do not use the generator since we need the whole array
            slices = list(range(0, N, parameters.output_batch_size))
            if slices[-1] != N:
                slices += [N]
            limits = list(zip(slices[:-1], slices[1:]))
            list_files = []
            for idx, (start, stop) in enumerate(limits):
                logging.debug("Processing tree from %d to %d (total = %d)" % (start, stop, N))
                data = Tree2Pandas(input_file=full_path,
                                   variables=var,
                                   weight=parameters.weights,
                                   cut=parameters.cut,
                                   reweight_to_cross_section=False,
                                   start=start,
                                   n=stop)
                split_file = name.replace('.root', '_%d.root' % idx)
                list_files.append(os.path.join(path_output, split_file))
                self.OutputFromTraining(data=data, path_output=path_output, output_name=split_file)
            # Concatenate the output and remove split #
            cmd = ["hadd", os.path.join(path_output, name)] + list_files
            subprocess.run(cmd)
            logging.info("Produced concatenated file at %s" % os.path.join(path_output, name))
            for split_file in list_files:
                os.remove(split_file)
                logging.debug("... Deleted split file %s" % split_file)
            logging.info("Cleaning of split files done")
        else:
            data = Tree2Pandas(input_file=full_path,
                               variables=var,
                               weight=parameters.weights,
                               cut=parameters.cut,
                               reweight_to_cross_section=False)
            if data.shape[0] == 0:
                logging.info('\tEmpty tree')
                continue  # Avoids empty trees
            self.OutputFromTraining(data=data, path_output=path_output, output_name=name)
def _recoverFromROOT(self):
    list_variables = ListBranches(self.save_path)
    self.data = Tree2Pandas(input_file=self.save_path, variables=list_variables)
    print('Output recovered from : ' + self.save_path)
    print('\tSample size : %d' % self.data.shape[0])
def main():
    #############################################################################################
    # Options #
    #############################################################################################
    parser = argparse.ArgumentParser(
        description='From a given set of root files, interpolate the MEM weights to a given mass point')
    parser.add_argument('-f', '--file', action='store', required=True, type=str, default='',
                        help='File (full path) to be used')
    parser.add_argument('--MA', action='store', required=True, type=int, default=0,
                        help='MA value for the interpolation')
    parser.add_argument('--MH', action='store', required=True, type=int, default=0,
                        help='MH value for the interpolation')
    parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Show DEBUG logging')
    opt = parser.parse_args()

    # Logging #
    if opt.verbose:
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    else:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    #############################################################################################
    # Make likelihood map #
    #############################################################################################
    # Get events from tree #
    logging.info('Looking at file %s' % opt.file)
    variables = [
        'weight_HToZA_mH_200_mA_50',
        'weight_HToZA_mH_200_mA_100',
        'weight_HToZA_mH_250_mA_50',
        'weight_HToZA_mH_250_mA_100',
        'weight_HToZA_mH_300_mA_50',
        'weight_HToZA_mH_300_mA_100',
        'weight_HToZA_mH_300_mA_200',
        'weight_HToZA_mH_500_mA_50',
        'weight_HToZA_mH_500_mA_100',
        'weight_HToZA_mH_500_mA_200',
        'weight_HToZA_mH_500_mA_300',
        'weight_HToZA_mH_500_mA_400',
        'weight_HToZA_mH_650_mA_50',
        'weight_HToZA_mH_800_mA_50',
        'weight_HToZA_mH_800_mA_100',
        'weight_HToZA_mH_800_mA_200',
        'weight_HToZA_mH_800_mA_400',
        'weight_HToZA_mH_800_mA_700',
        'weight_HToZA_mH_1000_mA_50',
        'weight_HToZA_mH_1000_mA_200',
        'weight_HToZA_mH_1000_mA_500',
        'weight_HToZA_mH_2000_mA_1000',
        'weight_HToZA_mH_3000_mA_2000',
    ]
    N = len(variables)
    #cuts = "weight_HToZA_mH_200_mA_50>weight_HToZA_mH_200_mA_50_err && weight_HToZA_mH_200_mA_100>weight_HToZA_mH_200_mA_100_err && weight_HToZA_mH_250_mA_50>weight_HToZA_mH_250_mA_50_err && weight_HToZA_mH_250_mA_100>weight_HToZA_mH_250_mA_100_err && weight_HToZA_mH_300_mA_50>weight_HToZA_mH_300_mA_50_err && weight_HToZA_mH_300_mA_100>weight_HToZA_mH_300_mA_100_err && weight_HToZA_mH_300_mA_200>weight_HToZA_mH_300_mA_200_err && weight_HToZA_mH_500_mA_50>weight_HToZA_mH_500_mA_50_err && weight_HToZA_mH_500_mA_100>weight_HToZA_mH_500_mA_100_err && weight_HToZA_mH_500_mA_200>weight_HToZA_mH_500_mA_200_err && weight_HToZA_mH_500_mA_300>weight_HToZA_mH_500_mA_300_err && weight_HToZA_mH_500_mA_400>weight_HToZA_mH_500_mA_400_err && weight_HToZA_mH_650_mA_50>weight_HToZA_mH_650_mA_50_err && weight_HToZA_mH_800_mA_50>weight_HToZA_mH_800_mA_50_err && weight_HToZA_mH_800_mA_100>weight_HToZA_mH_800_mA_100_err && weight_HToZA_mH_800_mA_200>weight_HToZA_mH_800_mA_200_err && weight_HToZA_mH_800_mA_400>weight_HToZA_mH_800_mA_400_err && weight_HToZA_mH_800_mA_700>weight_HToZA_mH_800_mA_700_err && weight_HToZA_mH_1000_mA_50>weight_HToZA_mH_1000_mA_50_err && weight_HToZA_mH_1000_mA_200>weight_HToZA_mH_1000_mA_200_err && weight_HToZA_mH_1000_mA_500>weight_HToZA_mH_1000_mA_500_err && weight_HToZA_mH_2000_mA_1000>weight_HToZA_mH_2000_mA_1000_err && weight_HToZA_mH_3000_mA_2000>weight_HToZA_mH_3000_mA_2000_err && weight_HToZA_mH_600_mA_250>weight_HToZA_mH_600_mA_250_err"
    cuts = ''
    other_variables = [s for s in ListBranches(opt.file) if s not in variables]
    events = Tree2Pandas(input_file=opt.file, variables=variables + other_variables, cut=cuts)
    points = Decoupler(events[variables], list_outputs=variables)  # [mH,mA,weight] x len(variables) for each event
    points = points.apply(pd.to_numeric)  # change dtypes to float64
    inter_weight = np.zeros(events.shape[0])
    manager = enlighten.get_manager()
    pbar = manager.counter(total=events.shape[0], desc='Progress', unit='Event')
    for i in range(events.shape[0]):
        x = points.iloc[i * N:(i + 1) * N, 1].values  # mA is x
        y = points.iloc[i * N:(i + 1) * N, 0].values  # mH is y
        z = points.iloc[i * N:(i + 1) * N, 2].values  # weight is z
        inst = InterpolationDelaunay(x, y, z)
        inter_weight[i] = inst.interpolate(opt.MA, opt.MH)
        pbar.update()
    manager.stop()
    inter_weight = np.nan_to_num(inter_weight)  # Set NaN values to 0

    # Make output #
    new_df = pd.DataFrame(inter_weight)
    new_df.columns = ['inter_HToZA_mH_%d_mA_%d' % (opt.MH, opt.MA)]
    out_df = pd.concat([events, new_df], axis=1)
    output = out_df.to_records(index=False, column_dtypes='float64')
    array2root(output, opt.file.replace('.root', '_delaunay.root'), mode='recreate')
    print('Output saved as %s' % opt.file.replace('.root', '_delaunay.root'))
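# The InterpolationDelaunay class used above is not shown in these snippets. As an
# illustration only, here is a minimal sketch of the same idea with scipy, assuming
# linear barycentric interpolation on the Delaunay triangulation of the known
# (mA, mH) points; the function name and signature below are hypothetical.
import numpy as np
from scipy.interpolate import LinearNDInterpolator

def interpolate_weight_sketch(mA_points, mH_points, weights, mA_target, mH_target):
    # Build the interpolator on the simulated mass grid ...
    interp = LinearNDInterpolator(np.column_stack((mA_points, mH_points)), weights)
    # ... and evaluate it at the requested mass point (returns NaN outside the
    # convex hull, which the caller can map to 0 with np.nan_to_num as done above)
    return float(interp(mA_target, mH_target))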
def main():
    #############################################################################################
    # Options #
    #############################################################################################
    parser = argparse.ArgumentParser(
        description='From a given set of root files, make different histograms in a root file')
    parser.add_argument('-m', '--model', action='store', required=True, type=str, default='',
                        help='NN model to be used')
    parser.add_argument('-f', '--file', action='store', required=False, type=str, default='',
                        help='File (full path) to be used')
    parser.add_argument('--mA', action='store', required=False, type=int, default=0,
                        help='Print as PDF only some of the mass configurations')
    parser.add_argument('--mH', action='store', required=False, type=int, default=0,
                        help='Print as PDF only some of the mass configurations')
    parser.add_argument('-n', '--number', action='store', required=False, type=int, default=0,
                        help='Number of events to build the likelihood map')
    parser.add_argument('--xmax', action='store', required=False, type=float, default=1500,
                        help='Maximum value for mA in the graph')
    parser.add_argument('--ymax', action='store', required=False, type=float, default=1500,
                        help='Maximum value for mH in the graph')
    parser.add_argument('--xmin', action='store', required=False, type=float, default=0,
                        help='Minimum value for mA in the graph')
    parser.add_argument('--ymin', action='store', required=False, type=float, default=0,
                        help='Minimum value for mH in the graph')
    parser.add_argument('--zmin', action='store', required=False, type=float, default=0,
                        help='Minimum value for the z axis in the graph')
    parser.add_argument('--zmax', action='store', required=False, type=float, default=None,
                        help='Maximum value for the z axis in the graph')
    parser.add_argument('--bins', action='store', required=False, type=int, default=100,
                        help='Number of bins in both graph axes')
    parser.add_argument('--suffix', action='store', required=False, type=str, default='',
                        help='Suffix to be added to the output name (likelihood_suffix.pdf/.root), defaults to an empty string')
    parser.add_argument('--PDF', action='store_true', required=False, default=False,
                        help='Produce a PDF from the root file')
    parser.add_argument('--profile', action='store_true', required=False, default=False,
                        help='Whether to make the profile likelihood starting from the TGraph2D')
    parser.add_argument('--zoom', action='store_true', required=False, default=False,
                        help='Zoom the TGraph2D according to the given boundaries')
    parser.add_argument('--norm', action='store_true', required=False, default=False,
                        help='Use the normalization by the visible cross section')
    parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False,
                        help='Show DEBUG logging')
    opt = parser.parse_args()

    # Logging #
    if opt.verbose:
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
    else:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    #############################################################################################
    # Get objects in TFile #
    #############################################################################################
    def getall(d, basepath="/"):
        "Generator function to recurse into a ROOT file/dir and yield (path, obj) pairs"
        for key in d.GetListOfKeys():
            kname = key.GetName()
            if key.IsFolder():
                for i in getall(d.Get(kname), basepath + kname + '/'):
                    yield i
            else:
                yield basepath + kname, d.Get(kname)
    #############################################################################################
    # Profile Likelihood #
    #############################################################################################
    if opt.profile:
        # Path to graph #
        path_root = os.path.abspath(os.path.join('PDF', opt.model, 'likelihood_' + opt.suffix + '.root'))
        path_out = os.path.abspath(os.path.join('PDF', opt.model))
        # Load TGraph2D #
        f = TFile(path_root)
        graphs = [(key, obj) for (key, obj) in getall(f)]
        for key, obj in graphs:
            if key.find('HToZA') == -1:
                continue
            mH_value = int(re.findall(r'\d+', key)[2])
            mA_value = int(re.findall(r'\d+', key)[3])
            if mH_value != opt.mH or mA_value != opt.mA:
                continue
            MakeProfile(graph=obj, mH=mH_value, mA=mA_value, N=10000, path=path_out, step=5, slices=10)
        sys.exit(0)

    #############################################################################################
    # Make PDF #
    #############################################################################################
    def ZoomHist(graph, bins, xmin, xmax, ymin, ymax):
        x = np.linspace(xmin, xmax, bins)
        y = np.linspace(ymin, ymax, bins)
        X, Y = np.meshgrid(x, y)
        X = X.ravel()
        Y = Y.ravel()
        valid = np.logical_and(np.greater_equal(Y, X), np.greater_equal(Y, 125))
        X = X[valid]
        Y = Y[valid]
        N = X.shape[0]
        new_graph = TGraph2D(N)
        manager = enlighten.get_manager()
        pbar = manager.counter(total=N, desc='Progress', unit='Point')
        for i in range(N):
            content = graph.Interpolate(X[i], Y[i])
            if content != 0 or Y[i] > 125:
                new_graph.SetPoint(i, X[i], Y[i], content)
            pbar.update()
        new_graph.SetTitle(graph.GetTitle())
        return copy.deepcopy(new_graph)

    if opt.PDF or opt.zoom:
        path_root = os.path.abspath(os.path.join('PDF', opt.model, 'likelihood_' + opt.suffix + '.root'))
        path_pdf = os.path.abspath(os.path.join('PDF', opt.model, 'likelihood_' + opt.suffix + '.pdf'))
        path_zoom = os.path.abspath(os.path.join('PDF', opt.model, 'likelihood_' + opt.suffix + '_zoom.root'))
        f = TFile(path_root)
        if opt.PDF:
            c1 = TCanvas('c1', 'c1', 1100, 900)
            c1.SetGrid()
            #canvas = TCanvas('canvas','canvas',900,900)
            #canvas.Print(path_pdf+'[')
            c1.Print(path_pdf + '[')
            c1.SetTopMargin(0.05)
            c1.SetBottomMargin(0.18)
            c1.SetLeftMargin(0.18)
            c1.SetRightMargin(0.2)
        graphs = [(key, obj) for (key, obj) in getall(f)]
        graphs = sorted(graphs, key=lambda tup: tup[0])  # Sort according to name
        for i, (key, obj) in enumerate(graphs):
            mH_value = 0
            mA_value = 0
            if key.find('DY') != -1:
                title = 'DY'
            elif key.find('TT') != -1:
                title = 'TT'
            else:
                mH_value = int(re.findall(r'\d+', key)[2])
                mA_value = int(re.findall(r'\d+', key)[3])
                title = 'HToZATo2L2B_mH_%d_mA_%d' % (mH_value, mA_value)
            if (mH_value != opt.mH or mA_value != opt.mA) and opt.mA != 0 and opt.mH != 0:
                continue
            logging.info('Processing %s' % key)
            try:
                if opt.zoom:
                    new_graph = ZoomHist(obj, opt.bins, opt.xmin, opt.xmax, opt.ymin, opt.ymax)
                    if os.path.exists(path_zoom):
                        root_file = TFile(path_zoom, "update")
                        new_graph.Write(title, TObject.kOverwrite)
                        logging.info("Zoomed graph replaced in %s" % path_zoom)
                    else:
                        root_file = TFile(path_zoom, "recreate")
                        new_graph.Write(title)
                        logging.info("Zoomed graph saved in %s" % path_zoom)
                if opt.PDF:
                    base_hist = TH2F('', '', opt.bins, opt.xmin, opt.xmax, opt.bins, opt.ymin, opt.ymax)
                    obj.SetHistogram(base_hist)
                    hist = obj.GetHistogram()
                    hist.SetContour(1000)
                    hist.GetXaxis().SetRangeUser(opt.xmin, opt.xmax)
                    hist.GetYaxis().SetRangeUser(opt.ymin, opt.ymax)
                    hist.SetMinimum(max(opt.zmin, hist.GetMinimum()))
                    amax = hist.GetMaximum() if opt.zmax is None else opt.zmax
                    hist.SetMaximum(amax)
hist.SetTitle(";M_{A} [GeV];M_{H} [GeV];-2 log L") hist.GetXaxis().SetTitleOffset(1.2) hist.GetYaxis().SetTitleOffset(1.2) hist.GetZaxis().SetTitleOffset(1.2) hist.GetXaxis().SetLabelSize(0.04) hist.GetYaxis().SetLabelSize(0.04) hist.GetZaxis().SetLabelSize(0.04) hist.GetXaxis().SetTitleSize(0.06) hist.GetYaxis().SetTitleSize(0.06) hist.GetZaxis().SetTitleSize(0.06) hist.Draw('colz') text = TPaveText(.55, .2, .80, .4, 'brNDC') text.AddText("Events with") text.AddText("M_{A} = %d GeV" % mA_value) text.AddText("M_{H} = %d GeV" % mH_value) text.SetTextColor(1) text.SetFillStyle(4100) text.Draw("same") #hist.SetTitle(obj.GetTitle()) c1.Print( path_pdf, 'Title:' + key.replace('.root', '').replace('/', '')) except Exception as e: logging.critical('Could not save %s due to error "%s"' % (key, e)) if opt.PDF: # canvas.Print(path_pdf) # canvas.Print(path_pdf+']') logging.info('PDF saved as %s' % path_pdf) c1.Print(path_pdf + ']') sys.exit() ############################################################################################# # Make likelihood map # ############################################################################################# # Get events from tree # logging.info('Looking at file %s' % opt.file) variables = [ 'lep1_p4.Pt()', 'lep1_p4.Eta()', 'lep2_p4.Pt()', 'lep2_p4.Eta()', 'lep2_p4.Phi()-lep1_p4.Phi()', 'jet1_p4.Pt()', 'jet1_p4.Eta()', 'jet1_p4.Phi()-lep1_p4.Phi()', 'jet2_p4.Pt()', 'jet2_p4.Eta()', 'jet2_p4.Phi()-lep1_p4.Phi()', 'met_pt', 'met_phi-lep1_p4.Phi()', ] events = Tree2Pandas(input_file=opt.file, variables=variables, n=opt.number).values if events.shape[0] == 0: raise RuntimeError("Did you forget -n ?") # Instantiate the map # likelihood = LikelihoodMap(name=opt.model, xmin=opt.xmin, ymin=opt.ymin, xmax=opt.xmax, ymax=opt.ymax, N=300, normalize=opt.norm) # Loop over events # logging.info('Adding events') manager = enlighten.get_manager() pbar = manager.counter(total=opt.number, desc='Progress', unit='Event') for i in range(events.shape[0]): likelihood.AddEvent(events[i, :]) pbar.update() manager.stop() # Make and print map # likelihood.MakeGraph(title=os.path.basename(opt.file), suffix=opt.suffix)