def predict(data, features, baseName, opt):
    """ runs the prediction for the trained models and dumps a tree """
    print '[predict] with', baseName, 'with', len(data), 'events'

    # load models and standard scaler
    with open(opt.model, 'r') as cache:
        best_models = pickle.load(cache)
        scaler = pickle.load(cache)

    # scale data and switch to pandas DataFrame
    df = pd.DataFrame(scaler.transform(data))
    df.columns = features

    # run all predictions
    pred = pd.DataFrame()
    for key in best_models:
        if key != 'rfc':
            continue
        for xangle in best_models[key]:
            tag = '%s_%d' % (key, xangle)
            print tag, 'for', baseName
            clf = best_models[key][xangle][0]
            features = best_models[key][xangle][-1]
            y_prob = clf.predict_proba(df[features])[:, 0]
            pred[tag] = y_prob

    # write to output
    rp.to_root(pred, baseName, key='pudiscr', store_index=False)
    if opt.output:
        os.system('xrdcp -f {0} root://eoscms//{1}/{0}'.format(
            baseName, opt.output.replace('/eos/cms/', '')))
        os.system('rm {0}'.format(baseName))
def store_dataframe(df, outfile, tname='chi2_values'):
    """
    Store the dataframe either into a pkl file or into a root file via
    root_pandas.
    """
    logging.debug('Storing DataFrame to {}'.format(outfile))
    if not outfile.endswith('.pkl') and not outfile.endswith('.root'):
        logging.warning('Output file does not have .root or .pkl format. '
                        'Creating a .pkl file instead')
        logging.debug('Output filename before substitution: {}'.format(outfile))
        import re
        outfile = re.sub(r'(.*\.)(\w*)$', r'\1pkl', outfile)
        logging.debug('Output filename after substitution: {}'.format(outfile))

    logging.info('Writing resulting DataFrame to: {}'.format(outfile))
    # if .root is requested check if root_pandas is here, otherwise go to .pkl
    if outfile.endswith('.root'):
        try:
            from root_pandas import to_root
            # current version of to_root doesn't support the store_index argument
            to_root(df, outfile, tname, mode='w')  # , store_index=False
        except ImportError:
            logging.warning('Output to .root file was requested, but root_pandas'
                            ' was not found. Creating a .pkl file instead')
            outfile = outfile.replace('.root', '.pkl')

    if outfile.endswith('.pkl'):
        df.to_pickle(outfile)
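# Usage sketch for store_dataframe above (hedged: assumes the standard
# `logging` module is configured and root_pandas may or may not be
# installed; the file names are illustrative only).
import logging
import pandas as pd

logging.basicConfig(level=logging.DEBUG)

toy = pd.DataFrame({'chi2': [0.4, 1.7, 2.3]})
store_dataframe(toy, 'fit_results.root')  # ROOT file if root_pandas is importable
store_dataframe(toy, 'fit_results.txt')   # unknown suffix: rewritten to fit_results.pkl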
def write_to_EPM(output, dfCols=None, trueTag=None, fileName='tagsToEPM.root'):
    '''
    Takes classifier output (in [0, 1]) and associated true tag associations
    (as PDG MC IDs) in a Pandas Series or DataFrame, along with (DataFrame)
    other variables to be written to the resulting ROOT file.

    Writes a ROOT file that can be imported into the Espresso Performance
    Monitor.
    '''
    try:
        import numpy as np
        import pandas as pd
        from root_pandas import to_root
    except ImportError:
        print('ERROR: Cannot import from root_pandas - no ROOT files have been written.')
        return

    decisions, mistags = decision_and_mistag(output)

    if type(dfCols) == pd.Series:
        dfCols = dfCols.to_frame()
    elif dfCols is None or type(dfCols) != pd.DataFrame:
        dfCols = pd.DataFrame()

    dfCols['tag'] = decisions.flatten().astype(np.int32)  # Short_t
    dfCols['eta'] = mistags.flatten().astype(np.double)   # Float_t
    if trueTag is not None:
        dfCols['truth'] = trueTag.flatten().astype(np.int32)  # Short_t

    to_root(dfCols, fileName, key='tree')
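# Usage sketch for write_to_EPM above. Hedged: `decision_and_mistag` must
# come from the surrounding module; the classifier output and true tags here
# are random placeholders, not real data.
import numpy as np
import pandas as pd

output = np.random.uniform(0, 1, size=(1000, 1))           # fake classifier output in [0, 1]
true_tags = np.random.choice([-521, 521], size=(1000, 1))  # fake PDG IDs (B+/B-)
extra = pd.DataFrame({'B_PT': np.random.exponential(5000, size=1000)})

write_to_EPM(output, dfCols=extra, trueTag=true_tags, fileName='tagsToEPM.root')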
def test_issue_80():
    df = pd.DataFrame({'a': [1, 2], 'b': [4, 5]})
    df.columns = ['a', 'a']
    try:
        root_pandas.to_root(df, '/tmp/example.root')
    except ValueError as e:
        assert 'DataFrame contains duplicated column names' in e.args[0]
    else:
        raise Exception('ValueError is expected')
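# to_root refuses duplicated column names, so a caller can rename duplicates
# before writing. A minimal sketch; the dedup_columns helper is hypothetical,
# not part of root_pandas.
import pandas as pd

def dedup_columns(df):
    """Return a copy of df with duplicated column names made unique (a, a -> a, a_1)."""
    seen = {}
    new_cols = []
    for col in df.columns:
        if col in seen:
            seen[col] += 1
            new_cols.append('{}_{}'.format(col, seen[col]))
        else:
            seen[col] = 0
            new_cols.append(col)
    out = df.copy()
    out.columns = new_cols
    return out

df = pd.DataFrame([[1, 4], [2, 5]], columns=['a', 'a'])
root_pandas.to_root(dedup_columns(df), '/tmp/example.root')  # no ValueError now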
def test_issue_60():
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    root_pandas.to_root(df, 'tmp_1.root', 'my_tree_1')
    root_pandas.to_root(df, 'tmp_2.root', 'my_tree')
    result = root_pandas.read_root(['tmp_1.root', 'tmp_2.root'], 'my_tree',
                                   warn_missing_tree=True)
    assert len(result) == 10
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
def test_detect_branches_first_missing():
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    to_root(df, 'tmp_1.root', 'my_tree_1')
    to_root(df, 'tmp_2.root', 'my_tree')
    read_df = read_root(['tmp_1.root', 'tmp_2.root'], 'my_tree',
                        warn_missing_tree=True)
    assert_frame_equal(df, read_df)
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
def run(self):
    df = root_pandas.read_root(*self.get_input_file_names('train.root'),
                               key=self.tree_name)
    # resample
    resampled_df = resample(df, random_state=self.random_seed)
    # store to root
    root_pandas.to_root(resampled_df,
                        self.get_output_file_name('train.root'),
                        key=self.tree_name)
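# The `resample` call above matches the signature of sklearn.utils.resample
# (bootstrap with replacement by default); a standalone sketch under that
# assumption.
import pandas as pd
from sklearn.utils import resample

df = pd.DataFrame({'x': range(10)})
boot = resample(df, replace=True, n_samples=len(df), random_state=42)
print(boot.index.value_counts().head())  # some rows drawn more than once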
def run(self):
    train, test = split_sample(ntuple_file=self.ntuple_file,
                               train_size=self.train_size,
                               test_size=self.test_size)
    # Store as ROOT files
    root_pandas.to_root(train, self.get_output_file_name('train.root'),
                        key=self.tree_name)
    root_pandas.to_root(test, self.get_output_file_name('test.root'),
                        key=self.tree_name)
def test_issue_63():
    df = pd.DataFrame({'a': [], 'b': []})
    root_pandas.to_root(df, 'tmp_1.root', 'my_tree')
    df = pd.DataFrame({'a': list(range(10)), 'b': list(range(10))})
    root_pandas.to_root(df, 'tmp_2.root', 'my_tree')
    result = list(
        root_pandas.read_root(['tmp_1.root', 'tmp_2.root'], 'my_tree',
                              where='a > 2', chunksize=1))
    assert len(result) == 7
    assert all(len(df) == 1 for df in result)
    os.remove('tmp_1.root')
    os.remove('tmp_2.root')
def run(self):
    expert = root_pandas.read_root(
        self.get_input_file_names('validation_expert.root'))

    # normalize to len_data / len_mc (off-res.)
    key_EventType = expert.keys()[1]
    len_data = len(expert[expert[key_EventType] == 1])
    len_mc = len(expert) - len_data

    weights = get_weights(expert_df=expert, normalize_to=len_data / len_mc)
    root_pandas.to_root(
        weights,
        self.get_output_file_name('validation_weights.root'),
        key='weights')
def run(self):
    # calculate the normalization from the ValidationReweighting output
    validation_weights = root_pandas.read_root(
        self.get_input_file_names("validation_weights.root"))
    len_data = len(
        validation_weights[validation_weights["EventType"] == 1])
    len_mc = len(validation_weights) - len_data

    expert = root_pandas.read_root(
        self.get_input_file_names('expert.root'))
    weights = get_weights(expert_df=expert, normalize_to=len_data / len_mc)
    root_pandas.to_root(weights,
                        self.get_output_file_name('weights.root'),
                        key=self.tree_name)
def store_dataframe(dfr, outfile, tname='chi2_values', **kwargs):
    """
    Store the dataframe either into a pkl file or into a root file via
    root_pandas.

    Args:
        dfr (pandas.DataFrame): The dataframe that should be stored
        outfile (str): The filename to which the DataFrame should be stored.
            If this ends with .pkl, a pkl file will be created, if it ends
            in .root a root file will be created (if root_pandas is
            available), otherwise a .pkl file will be created with .root
            replaced with .pkl
        tname (str, optional): Name of the TTree to be used for storing the
            DataFrame if stored to a root file

    Keyword Args:
        Forwarded to root_pandas.to_root

    See Also: root_pandas.to_root
    """
    logging.debug('Storing DataFrame to {}'.format(outfile))
    if not outfile.endswith('.pkl') and not outfile.endswith('.root'):
        logging.warning('Output file does not have .root or .pkl format. '
                        'Creating a .pkl file instead')
        logging.debug(
            'Output filename before substitution: {}'.format(outfile))
        outfile = re.sub(r'(.*\.)(\w*)$', r'\1pkl', outfile)
        logging.debug('Output filename after substitution: {}'.format(outfile))

    logging.info('Writing resulting DataFrame to: {}'.format(outfile))
    # if .root is requested check if root_pandas is here, otherwise go to .pkl
    if outfile.endswith('.root'):
        try:
            from root_pandas import to_root
            # current version of to_root doesn't support the store_index argument
            to_root(dfr, outfile, tname, **kwargs)  # , store_index=False
        except ImportError:
            logging.warning(
                'Output to .root file was requested, but root_pandas'
                ' was not found. Creating a .pkl file instead')
            outfile = outfile.replace('.root', '.pkl')

    if outfile.endswith('.pkl'):
        dfr.to_pickle(outfile)
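# Since **kwargs is forwarded to root_pandas.to_root, tree-level options can
# be passed straight through; a minimal sketch assuming the function above.
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)

dfr = pd.DataFrame({'chi2': [0.4, 1.7], 'ndf': [2, 3]})
store_dataframe(dfr, 'chi2.root', tname='chi2_values', mode='w')  # mode is forwarded to to_root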
def run(input_fns, output_fn, h1, h2, h3):
    keys = list_trees(input_fns[0])
    assert len(keys) == 1, keys
    df = read_root(input_fns, keys[0])

    df['H1_isMuon'] = df['H1_isMuon'].astype(bool)
    df['H2_isMuon'] = df['H2_isMuon'].astype(bool)
    df['H3_isMuon'] = df['H3_isMuon'].astype(bool)

    # Sort the columns so that the first is the most kaon-like
    assert sorted([h1, h2, h3]) == [h1, h2, h3], \
        'Children are ranked from kaon-like to pion-like'
    order = np.argsort(df[['H3_ProbK', 'H2_ProbK', 'H1_ProbK']], axis=1)
    for col in [c for c in df.columns if c.startswith('H1_')]:
        col = col[len('H1_'):]
        cols = [f'H1_{col}', f'H2_{col}', f'H3_{col}']
        df[cols] = df[cols].values[np.arange(order.shape[0])[:, None], order]

    # Compute the PE and mass of all particles
    for head, mass in [('H1', mass_dict[h1]), ('H2', mass_dict[h2]),
                       ('H3', mass_dict[h3])]:
        df.eval(f'{head}_P = sqrt({head}_PX**2 + {head}_PY**2 + {head}_PZ**2)',
                inplace=True)
        df.eval(f'{head}_PE = sqrt({mass}**2 + {head}_P**2)', inplace=True)

    for component in ['PE', 'PX', 'PY', 'PZ']:
        df.eval(
            f'B_{component} = H1_{component} + H2_{component} + H3_{component}',
            inplace=True)
    df.eval('B_M = sqrt(B_PE**2 - B_PX**2 - B_PY**2 - B_PZ**2)', inplace=True)

    # if [h1, h2, h3] == ['K', 'K', 'K']:
    # Ignore muons
    df.query('~(H1_isMuon | H2_isMuon | H3_isMuon)', inplace=True)
    # Apply an additional selection
    df.query('(H1_IPChi2 > 25) & (H2_IPChi2 > 25) & (H3_IPChi2 > 25)',
             inplace=True)
    # Apply a PID selection
    df.query(
        f'(H1_Prob{h1} > {pid_cut}) & (H2_Prob{h2} > {pid_cut}) & (H3_Prob{h3} > {pid_cut})',
        inplace=True)

    to_root(df, output_fn, key=f'B2{h1}{h2}{h3}', mode='w', store_index=False)
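# The B_M computation above is the relativistic invariant mass
# M^2 = E^2 - |p|^2 of the summed daughter four-vectors; a self-contained
# numpy check with toy momenta and an assumed kaon mass hypothesis.
import numpy as np

M_K = 493.677  # charged-kaon mass in MeV

p = np.array([[1000., 200., 5000.],   # toy (px, py, pz) per daughter, MeV
              [-300., 800., 7000.],
              [-50., -600., 4000.]])
E = np.sqrt(M_K**2 + np.sum(p**2, axis=1))   # daughter energies
B_E, B_p = E.sum(), p.sum(axis=0)            # four-vector sum
B_M = np.sqrt(B_E**2 - np.dot(B_p, B_p))     # invariant mass of the combination
print(B_M)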
def fill_trees(self, branch_names, year, print_yields=False):
    # have to save individual trees as root files (fn=bn), then hadd over a
    # single proc on the command line, to get one proc file with all tag trees
    debug_cols = [
        'dielectronMass', 'leadElectronPtOvM', 'subleadElectronPtOvM',
        'dijetMass', 'leadJetPt', 'subleadJetPt', 'ggH_mva', 'VBF_mva',
        'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag', 'proc',
        'tree_name'
    ]
    if print_yields:
        print_str = '*** Yields ***'
        lumi_map = {
            '2016A': 35.9,
            '2016B': 35.9,
            '2017': 41.5,
            '2018': 59.7
        }

    for proc in self.true_procs:
        selected_df = self.combined_df[self.combined_df.proc == proc]
        if print_yields:
            print_str += '\n \n Process: {}'.format(proc)
        for bn in branch_names[proc]:
            print bn
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print branch_selected_df[debug_cols].head(10)
            root_pandas.to_root(branch_selected_df[self.tree_vars],
                                'output_trees/{}/{}_{}.root'.format(
                                    year, bn, year),
                                key=bn)
            if print_yields:
                if proc != 'Data':
                    print_str += '\n Summed events in category {}: {}'.format(
                        bn,
                        np.sum(branch_selected_df['weight']) *
                        lumi_map[year] * 1000)
                else:
                    print_str += '\n Summed events in category {}: {}'.format(
                        bn, np.sum(branch_selected_df['weight']))
            print

    if print_yields:
        print print_str
def fit(self, X_train, y_train):
    '''Fit routine.

    Parameters
    ----------
    X_train : numpy.array, shape=(n_samples, n_observables)
        Observable sample.
    y_train : numpy.array, shape=(n_samples,)
        Target variable sample.
    '''
    X_train, y_train = super(TRUEEUnfolding, self).fit(X_train, y_train)
    self.binning_X.fit(X_train, y_train)
    self.binning_y.fit(y_train)
    X_digit = self.binning_X.digitize(X_train)
    y_digit = self.binning_y.digitize(y_train)
    df_train = pd.DataFrame(np.column_stack((X_digit, y_digit)),
                            columns=['x', 'y'])

    file_mc = 'temp_truee_train.root'
    self.tempdir = '_truee_temp_dir'
    self.path_mc = os.path.join(self.tempdir, file_mc)
    if not os.path.exists('_truee_temp_dir'):
        os.mkdir('_truee_temp_dir')
    rp.to_root(df_train, self.path_mc, 'data')

    self._config.update(source_file_moca=self.path_mc)
    self._config.update(roottree_moca='data')
    self._config.update(branch_y='x')
    self._config.update(number_y_bins=self.binning_X.n_bins)
    self._config.update(
        limits_y='{} {}'.format(-0.5, self.binning_X.n_bins - 0.5))
    self._config.update(branch_x='y')
    self._config.update(number_bins=self.binning_y.n_bins)
    self._config.update(max_number_bins=self.binning_y.n_bins)
    self._config.update(
        limits_x='{} {}'.format(-0.5, self.binning_y.n_bins - 0.5))
    self.is_fitted = True
def generate_data(size_mc=500000, size_data=10000, size_mc_offres=150000,
                  size_data_offres=8000, frac_a=0.8):
    """Generate root files to represent data and MC samples to demonstrate
    the re-weighting.

    Parameters:
        size_mc, size_data, size_mc_offres, size_data_offres: number of
            events in the corresponding sample.
        frac_a: fraction of events in componentA

    Return:
        data, componentA, componentB, data_offres, componentA_offres:
            pd.DataFrames of the generated samples.
    """
    frac_b = 1 - frac_a

    # GENERATE DATA
    print(
        "Generating the following dataframes:\n"
        "data, componentA, componentB, data_offres and componentA_offres ...")

    # Random state for random number generation
    rs = np.random.RandomState(seed=1)

    # on res
    data = pd.DataFrame()
    componentA = pd.DataFrame()
    componentB = pd.DataFrame()

    # variable1
    tmp_data = rs.triangular(0, 1, 1, size=int(size_data*frac_a*0.3))
    tmp_data = np.append(
        tmp_data, rs.normal(0.3, 0.1, int(size_data*frac_b)))
    tmp_data = np.append(
        tmp_data, rs.uniform(size=int(size_data*frac_a*0.7)))
    data["variable1"] = tmp_data
    data = data.loc[data["variable1"] >= 0]
    componentA["variable1"] = rs.uniform(size=int(size_mc * frac_a))
    componentB["variable1"] = rs.normal(
        0.3, 0.1, size=int(size_mc * frac_b))

    # variable2
    data["variable2"] = rs.uniform(size=len(data))
    componentA["variable2"] = rs.uniform(size=int(size_mc*frac_a))
    componentB["variable2"] = rs.uniform(size=int(size_mc*frac_b))

    # candidate and EventType
    data["__candidate__"] = [0]*len(data)
    componentA["__candidate__"] = [0]*len(componentA)
    componentB["__candidate__"] = [0]*len(componentB)
    data["EventType"] = [float(1)]*len(data)
    componentA["EventType"] = [float(0)]*len(componentA)
    componentB["EventType"] = [float(0)]*len(componentB)

    # off res
    data_offres = pd.DataFrame()
    componentA_offres = pd.DataFrame()

    # variable1
    tmp_data = rs.triangular(
        0, 1, 1, size=int(size_data_offres*frac_a*0.3))
    tmp_data = np.append(
        tmp_data, rs.uniform(size=int(size_data_offres*frac_a*0.7)))
    data_offres["variable1"] = tmp_data
    componentA_offres["variable1"] = rs.uniform(
        size=int(size_mc_offres*frac_a))

    # variable2
    data_offres["variable2"] = rs.uniform(size=len(data_offres))
    componentA_offres["variable2"] = rs.uniform(
        size=int(size_mc_offres*frac_a))

    # candidate and EventType
    data_offres["__candidate__"] = [0]*len(data_offres)
    componentA_offres["__candidate__"] = [0]*len(componentA_offres)
    data_offres["EventType"] = [float(1)]*len(data_offres)
    componentA_offres["EventType"] = [float(0)]*len(componentA_offres)

    # SAVE DATA
    print("Saving data to 'example_input/<file>.root' ...")
    if not os.path.exists("example_input"):
        os.makedirs("example_input")
    to_root(data, "example_input/data.root", key="variables")
    to_root(componentA, "example_input/componentA.root", key="variables")
    to_root(componentB, "example_input/componentB.root", key="variables")
    to_root(data_offres, "example_input/data_offres.root", key="variables")
    to_root(
        componentA_offres, "example_input/componentA_offres.root",
        key="variables")

    return data, componentA, componentB, data_offres, componentA_offres
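# Quick driver sketch for generate_data above: smaller sample sizes for a
# fast check; shapes and column names follow the defaults.
data, compA, compB, data_off, compA_off = generate_data(
    size_mc=50000, size_data=1000, size_mc_offres=15000, size_data_offres=800)
print(len(data), len(compA), len(compB), len(data_off), len(compA_off))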
        # (inside the event loop)
        pid = ham.add_process(Bc2JpsiLNu)
        pids.append(pid)

        # pl = ev.mu3_pt
        # q2 = ev.q2
        # ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])
        # print ("checkpoint C", i)
        ham.process_event()
        # print ("checkpoint D", i, pid)
        # import pdb ; pdb.set_trace()
        # print (pid, ham.get_weight('BGL', [pid]))
        # print (pid, ham.get_weight('Kiselev', [pid]))
        # weights.append(ham.get_weight('BGL', [pid]))
        # weights.append(ham.get_weight('BGL'))
        weights.append(ham.get_weight('Kiselev'))
        # print ("checkpoint E", i)

        if i > maxevents:
            break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree, 'reweighed_bc_tree_tau.root', key='tree')
def main(options):
    with open(options.config, 'r') as config_file:
        config = yaml.load(config_file)

    output_tag = config['output_tag']

    mc_dir = config['mc_file_dir']
    mc_fnames = config['mc_file_names']

    # data not needed yet, but still specify it in the config for
    # compatibility with the constructor
    data_dir = config['data_file_dir']
    data_fnames = config['data_file_names']

    proc_to_tree_name = config['proc_to_tree_name']

    proc_to_train_vars = config['train_vars']
    all_train_vars = [
        item for sublist in proc_to_train_vars.values() for item in sublist
    ]

    vars_to_add = config['vars_to_add']

    # Data handling stuff #

    # apply the loosest selection (ggH) first, else memory requirements are
    # ridiculous. Fine to do this since all cuts are looser than VBF
    # (not removing events with higher priority)
    loosest_selection = 'dielectronMass > 110 and dielectronMass < 150'

    # load the mc dataframe for all years. Do not apply any specific preselection
    root_obj = ROOTHelpers(output_tag, mc_dir, mc_fnames, data_dir,
                           data_fnames, proc_to_tree_name, all_train_vars,
                           vars_to_add, loosest_selection)
    root_obj.no_lumi_scale()
    for sig_obj in root_obj.sig_objects:
        root_obj.load_mc(sig_obj, reload_samples=options.reload_samples)
    if options.data_as_bkg:
        for data_obj in root_obj.data_objects:
            root_obj.load_data(data_obj, reload_samples=options.reload_samples)
    else:
        for bkg_obj in root_obj.bkg_objects:
            root_obj.load_mc(bkg_obj, bkg=True,
                             reload_samples=options.reload_samples)
    root_obj.concat()

    # Tag sequence stuff #
    if options.data_as_bkg:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.data_df])
    else:
        combined_df = pd.concat([root_obj.mc_df_sig, root_obj.mc_df_bkg])
    del root_obj

    # decide sequence of tags and specify preselection for use with numpy.select:
    tag_sequence = ['VBF', 'ggH']
    proc_to_preselection = {
        'VBF': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
            & combined_df['dijetMass'].gt(350)
            & combined_df['leadJetPt'].gt(40)
            & combined_df['subleadJetPt'].gt(30)
        ],
        'ggH': [
            combined_df['dielectronMass'].gt(110)
            & combined_df['dielectronMass'].lt(150)
            & combined_df['leadElectronPToM'].gt(0.333)
            & combined_df['subleadElectronPToM'].gt(0.25)
        ]
    }

    with open(options.bdt_config, 'r') as bdt_config_file:
        config = yaml.load(bdt_config_file)

    proc_to_model = config['models']
    proc_to_tags = config['boundaries']

    # evaluate MVA scores used in categorisation
    for proc, model in proc_to_model.iteritems():
        print 'evaluating classifier: {}'.format(model)
        clf = pickle.load(open('models/{}'.format(model), "rb"))
        train_vars = proc_to_train_vars[proc]
        combined_df[proc + '_bdt'] = clf.predict_proba(
            combined_df[train_vars].values)[:, 1:].ravel()

    # TAG NUMBER #
    # decide on tag
    for proc in tag_sequence:
        presel = proc_to_preselection[proc]
        tag_bounds = proc_to_tags[proc].values()
        tag_masks = []
        for i_bound in range(len(tag_bounds)):  # c++ style looping for index reasons
            if i_bound == 0:  # first bound, tag 0
                tag_masks.append(presel[0] & combined_df['{}_bdt'.format(
                    proc)].gt(tag_bounds[i_bound]))
            else:  # intermediate bound
                tag_masks.append(presel[0] & combined_df['{}_bdt'.format(
                    proc)].lt(tag_bounds[i_bound - 1]) & combined_df[
                        '{}_bdt'.format(proc)].gt(tag_bounds[i_bound]))
        mask_key = [icat for icat in range(len(tag_bounds))]
        combined_df['{}_analysis_tag'.format(proc)] = np.select(
            tag_masks, mask_key, default=-999)

    # PROC PRIORITY #
    # deduce tag priority: if two or more tags are satisfied then set the
    # final tag to the highest-priority tag. Make this non-hardcoded, i.e.
    # compare the proc in position 1 to all lower-priority positions, then
    # compare the proc in position 2, ...
    tag_priority_filter = [
        combined_df['VBF_analysis_tag'].ne(-999)
        & combined_df['ggH_analysis_tag'].ne(-999),  # 1) if both filled...
        combined_df['VBF_analysis_tag'].ne(-999)
        & combined_df['ggH_analysis_tag'].eq(-999),  # 2) if VBF filled and ggH not, take VBF
        combined_df['VBF_analysis_tag'].eq(-999)
        & combined_df['ggH_analysis_tag'].ne(-999),  # 3) if ggH filled and VBF not, take ggH
    ]

    tag_priority_key = [
        'VBF',  # 1) take VBF
        'VBF',  # 2) take VBF
        'ggH',  # 3) take ggH
    ]

    combined_df['priority_tag'] = np.select(
        tag_priority_filter, tag_priority_key,
        default='NOTAG')  # else keep -999 i.e. NOTAG

    # some debug checks:
    # print combined_df[['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]
    # print combined_df[combined_df.VBF_analysis_tag>-1][['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]
    # print combined_df[combined_df.ggH_analysis_tag>-1][['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']]

    # FILL TREES BASED ON BOTH OF ABOVE
    tree_vars = ['dZ', 'CMS_hgg_mass', 'weight']
    combined_df['dZ'] = float(0.)
    combined_df['CMS_hgg_mass'] = combined_df['dielectronMass']

    # FIXME: don't loop through events eventually, but for now I cba to use
    # numpy to vectorise it again
    # for true_proc in tag_sequence+['Data']:
    #     # isolate true proc
    #     true_proc_df = combined_df[combined_df.proc==true_proc.lower()]
    #     # how much true proc landed in each of our analysis cats?
    #     for target_proc in tag_sequence:  # for all events that got the proc tag, which tag did they fall into?
    #         true_proc_target_proc_df = true_proc_df[true_proc_df.priority_tag==target_proc]
    #         for i_tag in range(len(proc_to_tags[target_proc].values())):  # for each tag corresponding to the category we target, which events go in which tag
    #             true_procs_target_proc_tag_i = true_proc_target_proc_df[true_proc_target_proc_df['{}_analysis_tag'.format(target_proc)].eq(i_tag)]
    #
    #             branch_name = '{}_125_13TeV_{}cat{} :'.format(true_proc.lower(), target_proc.lower(), i_tag)
    #             print true_procs_target_proc_tag_i[['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']].head(10)
    #             print branch_name

    # get tree names
    branch_names = {}
    # print 'DEBUG: {}'.format(np.unique(combined_df['proc']))
    for true_proc in tag_sequence + ['Data']:
        branch_names[true_proc] = []
        for target_proc in tag_sequence:
            # for all events that got the proc tag, which tag did they fall into?
            for i_tag in range(len(proc_to_tags[target_proc].values())):
                # for each tag corresponding to the category we target,
                # which events go in which tag
                if true_proc != 'Data':
                    branch_names[true_proc].append(
                        '{}_125_13TeV_{}cat{}'.format(
                            true_proc.lower(), target_proc.lower(), i_tag))
                else:
                    branch_names[true_proc].append(
                        '{}_13TeV_{}cat{}'.format(true_proc,
                                                  target_proc.lower(), i_tag))

    # debug_procs = ['dipho_mass', 'dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 'dipho_sublead_ptoM', 'dijet_Mjj', 'dijet_LeadJPt', 'dijet_SubJPt', 'ggH_bdt', 'VBF_bdt', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag']
    debug_vars = [
        'proc', 'VBF_analysis_tag', 'ggH_analysis_tag', 'priority_tag'
    ]

    combined_df['tree_name'] = combined_df.apply(assign_tree, axis=1)
    print combined_df[debug_vars + ['tree_name']]

    if not path.isdir('output_trees/'):
        print 'making directory: {}'.format('output_trees/')
        system('mkdir -p %s' % 'output_trees/')

    # have to save individual trees then hadd procs together on the command line.
    for proc in tag_sequence + ['Data']:
        selected_df = combined_df[combined_df.proc == proc]
        for bn in branch_names[proc]:
            print bn
            branch_selected_df = selected_df[selected_df.tree_name == bn]
            print branch_selected_df[debug_vars + ['tree_name']].head(20)
            root_pandas.to_root(branch_selected_df[tree_vars],
                                'output_trees/{}.root'.format(bn),
                                key=bn)
            print
        # (inside the event loop)
        pids.append(pid)

        # pl = ev.mu3_pt
        # q2 = ev.q2
        # ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])
        # print ("checkpoint C", i)
        ham.process_event()
        # print ("checkpoint D", i, pid)
        # import pdb ; pdb.set_trace()
        # print (pid, ham.get_weight('BGL', [pid]))
        # print (pid, ham.get_weight('Kiselev', [pid]))
        # weights.append(ham.get_weight('BGL', [pid]))
        # weights.append(ham.get_weight('BGL'))
        weights.append(ham.get_weight('Kiselev'))
        # print ("checkpoint E", i)

        if i > maxevents:
            break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(
    np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree,
        'reweighed_bc_tree_tau_EFTtoKis_14Apr21_1vertx.root',
        key='tree')
        # (inside the event loop)
        pid = ham.add_process(Bc2JpsiLNu)
        pids.append(pid)

        pl = ev.mu3_pt
        q2 = ev.q2
        # ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])
        # print ("checkpoint C", i)
        ham.process_event()
        # print ("checkpoint D", i, pid)
        # import pdb ; pdb.set_trace()
        # print (pid, ham.get_weight('BGL', [pid]))
        # print (pid, ham.get_weight('Kiselev', [pid]))
        # weights.append(ham.get_weight('BGL', [pid]))
        # weights.append(ham.get_weight('BGL'))
        weights.append(ham.get_weight('Kiselev'))
        # print ("checkpoint E", i)

        if i > maxevents:
            break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree, 'reweighed_bc_tree_fromEfgtoKis.root', key='tree')
##### ETA BINS ##########################################################################################
@np.vectorize
def tauEta(eta):
    if abs(eta) > 2.1:
        return 7
    elif abs(eta) > 1.8:
        return 6
    elif abs(eta) > 1.5:
        return 5
    elif abs(eta) > 1.1:
        return 4
    elif abs(eta) > 0.8:
        return 3
    elif abs(eta) > 0.5:
        return 2
    elif abs(eta) > 0.2:
        return 1
    else:
        return 0

features.append('tauEta')

sigW['tauEta'] = tauEta(sigW['cand_refit_tau_eta'])
bkg['tauEta'] = tauEta(bkg['cand_refit_tau_eta'])
##########################################################################################

data = pd.concat([sigW, bkg], ignore_index=True, sort=True)
# data['id'] = np.arange(len(data))

train, test = train_test_split(data, test_size=0.4, random_state=1986)

# assign an id to the test and train sets separately to avoid mismatch when folding
train.insert(len(train.columns), 'id', np.arange(len(train)))
test.insert(len(test.columns), 'id', np.arange(len(test)))

if __name__ == '__main__':
    print "[INFO] Interactive mode: saving dataset to disk"
    import root_pandas
    root_pandas.to_root(data, 'dataframe.root', key='tree')
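# The same eta binning can be written without np.vectorize via np.digitize,
# which is usually faster on large arrays; a sketch with the bin edges copied
# from the if/elif chain above (right=True reproduces the strict '>' tests).
import numpy as np

eta_edges = np.array([0.2, 0.5, 0.8, 1.1, 1.5, 1.8, 2.1])

def tau_eta_bins(eta):
    # returns 0 for |eta| <= 0.2 up to 7 for |eta| > 2.1, matching tauEta
    return np.digitize(np.abs(eta), eta_edges, right=True)

print(tau_eta_bins(np.array([0.1, 0.6, 2.3])))  # [0 2 7]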
    # (tail of the branch list)
    'mu1_softID',
    'mu2_softID',
    'k_tightID',
    'k_mediumID',
    'k_softID',
    # 'mu1_isPF',
    # 'mu2_isPF',
    # 'k_isPF',
]

for k, v in samples.items():
    # for k in ['psi2s_tau']:
    for new_column, new_definition in to_define:
        if samples[k].HasColumn(new_column):
            continue
        samples[k] = samples[k].Define(new_column, new_definition)

    # convert to pandas
    samples[k] = pd.DataFrame(samples[k].AsNumpy())

    for icolumn in to_cast:
        if not math.isnan(samples[k][icolumn][0]):
            samples[k][icolumn] = samples[k][icolumn].astype(int)

    print('enrich the data', k)
    for i, label in zip(range(3), ['mu', 'tau', 'bkg']):
        samples[k]['bdt_%s' % label] = model.predict_proba(
            samples[k][features])[:, i]

    to_root(samples[k], '%s/%s_bdtenriched.root' % (tree_dir, k),
            key='BTo3Mu', store_index=False)
def predict(self, X, n_knots, n_dof,
            data_luminosity=1.0,
            moca_luminosity=1.0,
            moca_weight=1.0,
            fx_positive=False,
            smooth_x=False,
            zero_left=False,
            zero_right=False,
            constraints='',
            weight_first=0,
            cleanup=True,
            **kwargs):
    '''Calculates an estimate for the unfolding by calling TRUEE.

    Parameters
    ----------
    X : numpy.array, shape=(n_samples, n_observables)
        Observable sample.
    n_knots : int
        Number of knots for the spline representation used in TRUEE.
        Rule of thumb: should be about twice the number of bins in the
        target variable space.
    n_dof : int
        Number of degrees of freedom; the more, the less regularized
        the unfolding.
    data_luminosity : float
        I guess weights for X?
    moca_luminosity : float
        I guess weights for y?
    fx_positive : bool
        Whether to enforce positive results for the unfolded spectrum.
    smooth_x : bool
        Whether to smooth ... the observable vector? I don't know.
    zero_left, zero_right : bool
        Supposedly sets the left/right-most bin to zero. However, I don't
        think it does anything at all.
    constraints : str
        A string containing a C-style formula (without spaces!). No idea.
    weight_first : int
        Who knows.
    cleanup : bool
        Whether or not to delete all temporary files after TRUEE was called.

    Returns
    -------
    result : ``pyunfolding.utils.UnfoldingResult`` object
        The result of the unfolding, see documentation for `UnfoldingResult`.
    '''
    if not self.is_fitted:
        raise RuntimeError(
            'Unfolding not yet fitted. Use `fit` routine first.')
    X = super(TRUEEUnfolding, self).predict(X)

    # Storing parameters to config dictionary
    self._config.update(number_deg_free=n_dof)
    self._config.update(max_number_deg_free=n_dof)
    self._config.update(number_knots=n_knots)
    self._config.update(max_number_knots=n_knots)
    self._config.update(data_luminosity=data_luminosity)
    self._config.update(moca_luminosity=moca_luminosity)
    self._config.update(moca_weight=moca_weight)
    self._config.update(fx_positive=int(fx_positive))
    self._config.update(smooth_x=int(smooth_x))
    self._config.update(zero_left=int(zero_left))
    self._config.update(zero_right=int(zero_right))
    self._config.update(constraints=constraints)
    self._config.update(weight_first=weight_first)

    X_digit = self.binning_X.digitize(X)
    file_dt = 'temp_truee_test.root'
    self.path_dt = os.path.join(self.tempdir, file_dt)
    df_test = pd.DataFrame(np.column_stack(
        (X_digit, np.zeros(len(X_digit)))), columns=['x', 'y'])
    rp.to_root(df_test, self.path_dt, 'data')
    self._config.update(roottree_data='data')
    self._config.update(source_file_data=self.path_dt)

    # Write config file and run TRUEE
    file_conf = 'parameters.config'
    self.path_conf = os.path.join(self.tempdir, file_conf)
    self._write_config_file(self.path_conf)
    os.system('{} {}'.format(self.TRUEE_CALL, self.path_conf))

    f = ROOT.TFile.Open(self.TRUEE_RESULT)
    g = f.GetDirectory('RealDataResults')
    string = 'bins_{}_knots_{}_degFree_{}'.format(self.binning_y.n_bins,
                                                  n_knots, n_dof)
    cov = np.array([[
        g.Get('Tcovar_matrix_{};1'.format(string))(i, j)
        for i in range(self.binning_y.n_bins)
    ] for j in range(self.binning_y.n_bins)])
    h = g.Get('events_result_{};1'.format(string))
    f_vals = np.array(
        [h.GetBinContent(i) for i in range(self.binning_y.n_bins + 1)])
    f_err = np.sqrt(cov.diagonal())

    # Cleanup temp files
    if cleanup:
        os.remove(self.path_mc)
        os.remove(self.path_dt)
        os.remove(self.path_conf)
        os.remove(self.TRUEE_RESULT)
        os.rmdir(self.tempdir)

    # I'm not sure why this is necessary, but it is. And it's not an
    # elegant solution either.
    scaling = np.sum(f_vals) / len(X)
    return UnfoldingResult(f=f_vals[1:] / scaling,
                           f_err=np.vstack((f_err, f_err)) / scaling,
                           cov=cov,
                           binning_y=self.binning_y,
                           success=True)
# Target for the regression to predict the correction factor
data['target'] = data.genJetPt / data.jetPt

# Additional selections to limit phase space
# data = data[(np.abs(data.jetEta) < 1.3) & (data.genJetPt > 60.) & ((data.target > 0.9) & (data.target < 1.1))]
data = data[(np.abs(data.jetEta) < 1.3) & (data.genJetPt > 60.)]

# Split into a set used for training and validation, and a separate test set, 0.9/0.1
training, test = train_test_split(data, shuffle=True, test_size=0.1)
test.reset_index(drop=True, inplace=True)
training.reset_index(drop=True, inplace=True)
training = training[((training.target > 0.9) & (training.target < 1.1))]

# Save test data to a separate file for post-training plotting
to_root(test, 'test_data.root', key='tree')

# Scale input variables for training and save the scaler for future use in plotting
scaler = MinMaxScaler().fit(training[Training_variables].values)
dump(scaler, "scaler.pkl")
train_inp = pd.DataFrame(scaler.transform(training[Training_variables].values),
                         columns=Training_variables)
train_trg = training['target']

# Prepare test data for monitoring plots
test_true = test[[
    'isPhysUDS', 'isPhysG', 'genJetPt', 'jetPt', 'QG_ptD', 'QG_axis2',
    'QG_mult'
]]
test_inp = pd.DataFrame(scaler.transform(test[Training_variables].values),
                        columns=Training_variables)
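# To undo the scaling at plotting time, the saved scaler can be reloaded and
# inverted; a sketch assuming `dump` above comes from joblib, as is common
# with sklearn scalers.
from joblib import load

scaler = load("scaler.pkl")
original = scaler.inverse_transform(test_inp.values)  # back to physical units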
        # (inside the event loop)
        pids.append(pid)

        pl = ev.mu3_pt
        q2 = ev.q2
        # ham.fill_event_histogram("pEllVsQ2:Bc", [pl, q2])
        # print ("checkpoint C", i)
        ham.process_event()
        # print ("checkpoint D", i, pid)
        # import pdb ; pdb.set_trace()
        # print (pid, ham.get_weight('BGL', [pid]))
        # print (pid, ham.get_weight('Kiselev', [pid]))
        # weights.append(ham.get_weight('BGL', [pid]))
        # weights.append(ham.get_weight('BGL'))
        weights.append(ham.get_weight('Kiselev'))
        # print ("checkpoint E", i)

        if i > maxevents:
            break

reduced_tree = tree_df[:len(weights)]
reduced_tree['hammer'] = np.nan_to_num(
    np.array(weights))  # some NaNs, check the manual
to_root(reduced_tree, 'reweighed_bc_tree_mu_fromEfgtoKis_14Apr21.root', key='tree')
def main(data_path, gamma_path, corsika_path, config_template, output_base,
         threshold, theta2_cut, gamma_fraction, title, start, end, zd_min,
         zd_max):
    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f['events'].keys()

    if source_dependent:
        other_columns.extend(bg_prediction_columns)
        theta_cut = np.inf
        theta2_cut = np.inf
        print('Source dependent separation, ignoring theta cut')

    theta_cut = np.sqrt(theta2_cut)

    data = read_h5py(data_path, key='events',
                     columns=data_columns + output_columns + other_columns)
    gammas = read_h5py(
        gamma_path,
        key='events',
        columns=mc_columns + output_columns + other_columns,
    )
    gammas.rename(
        columns={'corsika_evt_header_total_energy': 'true_energy'},
        inplace=True,
    )

    runs = read_h5py(data_path, key='runs')

    data['timestamp'] = pd.to_datetime(
        data['unix_time_utc_0'] * 1e6 + data['unix_time_utc_1'],
        unit='us',
    )

    if start:
        data = data.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end:
        data = data.query('timestamp <= @end')
        runs = runs.query('run_start <= @end')

    min_zenith = runs.zenith.min()
    max_zenith = runs.zenith.max()
    if zd_min:
        min_zenith = max(min_zenith, zd_min)
    if zd_max:
        max_zenith = min(max_zenith, zd_max)

    print('Zenith range of the input data:', min_zenith, max_zenith)

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(data, threshold)
        on_gammas = gammas.query('gamma_prediction >= {}'.format(threshold))
    else:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction >= {}'.format(threshold)),
            theta2_cut=theta2_cut,
        )
        on_gammas = gammas.query(
            '(theta_deg <= {}) & (gamma_prediction >= {})'.format(
                theta_cut, threshold,
            ))

    query = '(zd_tracking >= {}) and (zd_tracking <= {})'.format(
        min_zenith, max_zenith)
    on_gammas = on_gammas.query(query).copy()

    output_columns.append('theta_deg')
    on_gammas = on_gammas.loc[:, output_columns + ['true_energy']]
    on_data = on_data.loc[:, output_columns + data_columns]
    off_data = off_data.loc[:, output_columns + data_columns]

    off_data['weight'] = 0.2
    on_data['weight'] = 1.0
    on_gammas['weight'] = 1.0

    rpd.to_root(on_data, output_base + '_on.root', key='events')
    rpd.to_root(off_data, output_base + '_off.root', key='events')
    rpd.to_root(on_gammas, output_base + '_mc.root', key='events')

    print('N_on: {}'.format(len(on_data)))
    print('N_off: {}'.format(len(off_data)))
    print('S(Li&Ma): {}'.format(
        li_ma_significance(len(on_data), len(off_data), 0.2)))
    print('N_mc: {}'.format(len(on_gammas)))

    n_excess = len(on_data) - 0.2 * len(off_data)
    fraction = n_excess / len(on_gammas)
    print('N_excess:', n_excess)
    print('Fraction: {:1.4f}'.format(fraction))

    with open(config_template) as f:
        template = f.read()

    t_obs = runs.ontime.sum()

    try:
        corsika = pd.read_hdf(corsika_path, key='table')
    except KeyError:
        f = h5py.File(corsika_path)
        print("given key not in file: possible keys are: {}".format(
            list(f.keys())))
        return

    corsika['zenith'] = np.rad2deg(corsika['zenith'])
    corsika = corsika.query('(zenith >= {}) and (zenith <= {})'.format(
        min_zenith, max_zenith))
    print('Simulated events after zenith cut: {}'.format(len(corsika)))

    config = template.format(
        t_obs=t_obs,
        selection_fraction=gamma_fraction,
        n_gamma=len(corsika),
        source_file_on=output_base + '_on.root',
        source_file_off=output_base + '_off.root',
        source_file_mc=output_base + '_mc.root',
        tree_name='events',
        output_file=output_base + '_result.root',
        fraction=fraction,
        min_zenith=min_zenith,
        max_zenith=max_zenith,
        title=title,
    )

    with open(output_base + '.config', 'w') as f:
        f.write(config)
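# li_ma_significance is not defined in this snippet; a sketch of the standard
# Li & Ma (1983, Eq. 17) on/off significance it presumably implements, with
# alpha = 0.2 matching the off-data weight above.
import numpy as np

def li_ma_significance(n_on, n_off, alpha):
    """Li & Ma (1983), Eq. 17: significance of an on/off counting measurement."""
    n_on, n_off = float(n_on), float(n_off)
    term_on = n_on * np.log((1 + alpha) / alpha * n_on / (n_on + n_off))
    term_off = n_off * np.log((1 + alpha) * n_off / (n_on + n_off))
    return np.sqrt(2 * (term_on + term_off))

print(li_ma_significance(120, 400, 0.2))  # ~3.8 for this toy excess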
# print name_fields
# name_fields = np.append(name_fields, ['ggFVBF'])
# name_fields = np.append(name_fields, ['NNScore'])

DF_test = pd.DataFrame(np.load(DF_path + 'ResultsTestPD.npy'),
                       columns=name_fields)
DF_train = pd.DataFrame(np.load(DF_path + 'ResultsTrainPD.npy'),
                        columns=name_fields)
# print DF_test
# print DF_test.shape
# print name_fields.shape

rp.to_root(DF_test, 'NNFlatTree_TestSample.root', key='NNFlatTree')

DF_test_VBF = DF_test[DF_test['ggFVBF'] == 1]
DF_test_ggF = DF_test[DF_test['ggFVBF'] == 0]
DF_train_VBF = DF_train[DF_train['ggFVBF'] == 1]
DF_train_ggF = DF_train[DF_train['ggFVBF'] == 0]

rp.to_root(DF_test_VBF, 'NNFlatTree_VBF1000.root', key='NNFlatTree')
rp.to_root(DF_test_ggF, 'NNFlatTree_ggF1000.root', key='NNFlatTree')

### Vectorial Tree from Reader ###
if runForVBFggF:
    VT_name = VT_path + 'VBF_H1000.root'
    DF_VT_VBF1000 = pd.DataFrame(
        root2array(VT_name, 'Nominal', branches=list_branches(VT_name)))
def main():
    # get the observables from the MC root files
    if file_type == 'Signal_MU':
        fname = '/disk/lhcb_data/amathad/Lb2Lclnu_analysis/MC/Lb2Lcmunu_MagUp_2016_Combine.root'
        key = 'DecayTree'
        reco_truth_vars = [
            'Lb_True_Q2_mu', 'Lb_True_Costhetal_mu', 'q2_Pred', 'costhl_Pred'
        ]
        extra_sel_vars = [
            'isTruth', 'isFiducial', 'Event_LbProdcorr',
            'Event_TrackCalibcorr', 'Event_PIDCalibEffWeight',
            'Event_L0Muoncorr', 'isFullsel', 'runNumber', 'eventNumber'
        ]
        columns = reco_truth_vars + extra_sel_vars
    elif file_type == 'Signal_MD':
        fname = '/disk/lhcb_data/amathad/Lb2Lclnu_analysis/MC/Lb2Lcmunu_MagDown_2016_Combine.root'
        key = 'DecayTree'
        reco_truth_vars = [
            'Lb_True_Q2_mu', 'Lb_True_Costhetal_mu', 'q2_Pred', 'costhl_Pred'
        ]
        extra_sel_vars = [
            'isTruth', 'isFiducial', 'Event_LbProdcorr',
            'Event_TrackCalibcorr', 'Event_PIDCalibEffWeight',
            'Event_L0Muoncorr', 'isFullsel', 'runNumber', 'eventNumber'
        ]
        columns = reco_truth_vars + extra_sel_vars
    elif file_type == 'Gen':
        fname = '/home/hep/amathad/LbToLclnu_RunTwo/FittingScripts/qsq_cthl_spectra/Differential_density/responsematrix_eff/GeomEffFiles/LcMuNu_gen_new.root'
        key = 'DecayTree'
        columns = ['Lb_True_Costhetal_mu', 'Lb_True_Q2_mu', 'Event_LbProdcorr']

    # get phsp array using the model.import_unbinned_data function
    # (using pathrootfile as pathname input)
    df_phsp_arr = read_root(fname, key=key, columns=columns)

    # import the fit results file for PDF_OLD and make a dictionary
    with open('./MC_fitres.txt') as txt:
        data = txt.readlines()
    print(len(data), data)

    dict_params_pdf_old = {}
    for i in range(len(data)):
        dataline = data[i].split()
        print(dataline)
        if 'loglh' in str(dataline[0]):
            break
        else:
            dict_params_pdf_old[str(dataline[0])] = float(dataline[1])
    print(dict_params_pdf_old)

    # fill with weights
    fill_weights(scenario, df_phsp_arr, dict_params_pdf_old, n_params=n_params)
    print(df_phsp_arr)

    # dump the file to root
    if conservative:
        f_new_name = ('./model_dependency_rootfiles_conservative/' +
                      fname.split('/')[-1])
    else:
        f_new_name = './model_dependency_rootfiles/' + fname.split('/')[-1]
    f_new_name = f_new_name.replace('.root',
                                    '_' + scenario + '_modeldependency.root')
    print(f_new_name)
    to_root(df_phsp_arr, f_new_name, key=key, store_index=False)
    # (tail of the branch list)
    'mmm_p4',
    'jpsiK_p4',
    'pion_p4',
    'jpsipi_p4',
    'jpsi_p4',
    'Bdir_eta',
    'Bdir_phi',
]

for k, v in samples.items():
    for new_column, new_definition in to_define:
        if samples[k].HasColumn(new_column):
            continue
        samples[k] = samples[k].Define(new_column, new_definition)

    # convert to pandas
    samples[k] = pd.DataFrame(samples[k].AsNumpy(exclude=to_exclude))

    for icolumn in to_cast:
        samples[k][icolumn] = samples[k][icolumn].astype(bool, copy=False)

    print('enrich the data', k)
    for i, label in zip(range(3), ['mu', 'tau', 'bkg']):
        samples[k]['bdt_%s' % label] = model.predict_proba(
            samples[k][features])[:, i]

    to_root(samples[k], '%s/BcToXToJpsi_is_%s_enriched.root' % (tree_dir, k),
            key='BTommm', store_index=False)
df['time'] = pd.to_datetime(df['created_at'])
df['tbench'] = df['field1'].astype(float)
df['hbench'] = df['field2'].astype(float)
df['tchiller'] = df['field3'].astype(float)
df['chillerstatus'] = df['field4'].astype(float)
df['tlab'] = df['field5'].astype(float)
df['hlab'] = df['field6'].astype(float)

df.set_index('time', inplace=True)  # set the index to the date column
# convert time to Rome timezone
# df.index = df.index.tz_localize('GMT')
# df.index = df.index.tz_convert('Europe/Rome')

# select only meaningful data
df = df[df.index >= '2019-05-10']

# convert date to epoch
df['timestamp'] = df.index.astype('int64') / 1000000000

# remove unnecessary columns
df = df.loc[:, 'tbench':'timestamp']

print df.head(5)
print "......."
print df.tail(5)

from root_pandas import to_root
to_root(df, options.output, key='LYBenchTemp')
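# Round-trip check: read the tree back with root_pandas (a sketch, assuming
# the same output path; print statements match the Python 2 style above).
from root_pandas import read_root

df_check = read_root(options.output, key='LYBenchTemp')
print df_check[['tbench', 'tlab', 'timestamp']].head()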