def phsp_goofit(flat_ltime=False):
    """Load the phase-space sample and attach generated decay times.

    Reads the ``events`` tree of ``phsp_mc.root`` (``stop=15000000`` is
    passed to ``read_root``), renames the columns to the names provided
    by ``vars`` and multiplies the two mass columns by 1000.

    :flat_ltime: if true, ``D0_Loki_BPVLTIME`` is drawn uniformly between
        0.0001725 and 0.00326; otherwise it is filled with
        ``two_parts_generate(turn=0.55, ...) / 1000``.
    :returns: the resulting DataFrame.

    """
    import root_pandas

    # Alternative input, currently unused:
    # 'root://eoslhcb.cern.ch//eos/lhcb/user/d/dmuller/K3Pi/RS_with_weight.root'
    path = 'root://eoslhcb.cern.ch//eos/lhcb/user/d/dmuller/K3Pi/phsp_mc.root'
    df = root_pandas.read_root(path, 'events', stop=15000000)

    new_names = {
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': vars.ltime(mode_config.D0),
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34(),
    }
    df.rename(columns=new_names, inplace=True)

    # Scale the masses by 1000 (presumably GeV -> MeV; confirm against
    # the input file).
    for mass_column in (vars.m12(), vars.m34()):
        df[mass_column] = df[mass_column] * 1000.

    # NOTE(review): the decay-time column is addressed by its literal name
    # here, presumably identical to vars.ltime(mode_config.D0) -- confirm.
    n_events = df.index.size
    if flat_ltime:
        generated = np.random.uniform(0.0001725, 0.00326, size=n_events)
    else:
        generated = two_parts_generate(turn=0.55, size=n_events) / 1000.
    df['D0_Loki_BPVLTIME'] = generated
    return df
def _ltime_ratio(df):
    """Return the D0 decay time of ``df`` divided by ``config.Dz_ltime``.

    :df: frame holding the column named ``vars.ltime(gcm().D0)``.
    :returns: the plain value ``1`` if ``is_dummy_run(df)`` is true,
        otherwise a Series named ``ltime_ratio`` on the index of ``df``.

    """
    ltime_column = vars.ltime(gcm().D0)
    # The column is read before the dummy-run check on purpose: the order
    # of the original implementation is kept so that the access to ``df``
    # always happens (presumably so a dummy run can see which column is
    # needed -- confirm).
    ratio = df[ltime_column] / config.Dz_ltime
    if not is_dummy_run(df):
        return pd.Series(ratio, name='ltime_ratio', index=df.index)
    return 1
def get_model_ws_alt(redo=False):
    """Return the events of ``Sig_49_61_WSNR_Smaller.root`` as a DataFrame.

    :redo: if true, any bcolz table cached under the
        ``generated_model_ws_alt`` location is deleted, the ROOT file is
        read again, converted and written back to bcolz. If false, the
        cached bcolz table is opened and returned instead.
    :returns: DataFrame with renamed columns, masses multiplied by 1000,
        decay time divided by 1000 and restricted to
        0.0001725 < decay time < 0.003256 (when ``redo`` is true), or the
        content of the cached table.

    """
    files = [
        '/afs/cern.ch/user/c/chasse/public/forDominik/Sig_49_61_WSNR_Smaller.root'
    ]  # NOQA
    bcolz_folder = config.bcolz_locations.format('generated_model_ws_alt')

    if not redo:
        return bcolz.open(bcolz_folder).todataframe()

    try:
        shutil.rmtree(bcolz_folder)
    except OSError:
        # Best effort: there may simply be no previous cache to delete.
        # Only filesystem errors are ignored, so that e.g. a
        # KeyboardInterrupt is no longer swallowed here.
        pass

    # ROOT has to be allowed before root_pandas is imported.
    helpers.allow_root()
    import root_pandas
    df = root_pandas.read_root(files, 'events')

    # Now rename stuff and fix units to MeV and ns.
    # Ugly hardcoded for now.
    ltime_column = vars.ltime(mode_config.D0)
    df.rename(columns={
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': ltime_column,
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34()
    }, inplace=True)
    df[vars.m12()] = df[vars.m12()] * 1000.
    df[vars.m34()] = df[vars.m34()] * 1000.
    df[ltime_column] = df[ltime_column] / 1000.

    # Keep only the decay-time window used in the analysis.
    df = df.query('{} > 0.0001725'.format(ltime_column))
    df = df.query('{} < 0.003256'.format(ltime_column))

    bcolz.ctable.fromdataframe(df, rootdir=bcolz_folder)
    return df
def get_model(redo=False):
    """Return the generated model events as a DataFrame.

    :redo: if true, any bcolz table cached under the ``generated_model``
        location is deleted, the files in ``filelists.Generated.paths``
        are read again, converted and written back to bcolz. If false,
        the cached bcolz table is opened and returned instead.
    :returns: DataFrame with renamed columns, masses multiplied by 1000,
        decay time divided by 1000 and restricted to
        0.0001725 < decay time < 0.003256 (when ``redo`` is true), or the
        content of the cached table.

    """
    files = filelists.Generated.paths
    bcolz_folder = config.bcolz_locations.format('generated_model')

    if not redo:
        return bcolz.open(bcolz_folder).todataframe()

    try:
        shutil.rmtree(bcolz_folder)
    except OSError:
        # Best effort: there may simply be no previous cache to delete.
        # Only filesystem errors are ignored, so that e.g. a
        # KeyboardInterrupt is no longer swallowed here.
        pass

    # ROOT has to be allowed before root_pandas is imported.
    helpers.allow_root()
    import root_pandas
    df = root_pandas.read_root(files, 'events')

    # Now rename stuff and fix units to MeV and ns.
    # Ugly hardcoded for now.
    ltime_column = vars.ltime(mode_config.D0)
    df.rename(columns={
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': ltime_column,
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34()
    }, inplace=True)
    df[vars.m12()] = df[vars.m12()] * 1000.
    df[vars.m34()] = df[vars.m34()] * 1000.
    df[ltime_column] = df[ltime_column] / 1000.

    # Keep only the decay-time window used in the analysis.
    df = df.query('{} > 0.0001725'.format(ltime_column))
    df = df.query('{} < 0.003256'.format(ltime_column))

    bcolz.ctable.fromdataframe(df, rootdir=bcolz_folder)
    return df
def phsp_goofit_alt():
    """Load the ``RS_with_weight_dtime.root`` sample as a DataFrame.

    Reads the ``events`` tree, renames the columns to the names provided
    by ``vars``, multiplies the two mass columns by 1000 and divides
    ``D0_Loki_BPVLTIME`` by 1000.

    :returns: the resulting DataFrame.

    """
    import root_pandas

    path = 'root://eoslhcb.cern.ch//eos/lhcb/user/d/dmuller/K3Pi/RS_with_weight_dtime.root'
    df = root_pandas.read_root(path, 'events')

    new_names = {
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': vars.ltime(mode_config.D0),
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34(),
    }
    df.rename(columns=new_names, inplace=True)

    for mass_column in (vars.m12(), vars.m34()):
        df[mass_column] = df[mass_column] * 1000.

    # NOTE(review): the decay-time column is addressed by its literal name
    # here, presumably identical to vars.ltime(mode_config.D0) -- confirm.
    df['D0_Loki_BPVLTIME'] = df['D0_Loki_BPVLTIME'] / 1000.
    return df
def get(mode):
    """Get the preselection ROOT information from the mode

    :mode: mode object; its ``D0``, ``head``, ``Pislow``, ``mc`` and
        ``mode`` attributes are used to build the cuts.
    :returns: a single selection string in which every cut is wrapped in
        parentheses and all cuts are joined with ``' && '``.

    """
    # PID requirements are only added when ``mode.mc`` is None
    # (presumably: when running on real data -- confirm).
    apply_pid = mode.mc is None

    _cuts = []
    # Disabled D0 mass window:
    # _cuts += ['fabs(' +
    #           m(mode.D0) +
    #           ' - {}) < 60.'.format(config.PDG_MASSES[config.Dz])]
    _cuts.append(
        build_step_cuts(ipchi2, mode.D0.all_daughters(), [4, 4, 4, 4]))

    # Momentum window on every daughter of the head particle. This is
    # added exactly once; repeating the same cuts would only lengthen the
    # selection string without changing which events pass.
    for daug in mode.head.all_daughters():
        _cuts.append(p(daug) + ' >= 3000.')
        _cuts.append(p(daug) + ' < 100000.')

    _cuts.append(dtf_chi2(mode.head) + ' > 0.')
    _cuts.append(vdchi2(mode.D0) + ' > 0.')
    _cuts.append(maxdoca(mode.D0) + ' > 0.')
    _cuts.append(mindoca(mode.D0) + ' > 0.')
    _cuts.append(ltime(mode.D0) + ' > -10000.')

    for kaon in mode.head.all_pid(config.kaon):
        if apply_pid:
            _cuts.append(probnnk(kaon) + ' > 0.3')
            _cuts.append(probnnpi(kaon) + ' < 0.7')

    for pion in mode.head.all_pid(config.pion):
        if apply_pid:
            _cuts.append(probnnpi(pion) + ' > 0.3')
            _cuts.append(probnnk(pion) + ' < 0.7')

    # NOTE(review): the cuts below are built from ``mode.Pislow`` rather
    # than from the particle yielded by ``all_pid``; they are emitted once
    # per slow pion found. Confirm this is intended.
    for _ in mode.head.all_pid(config.slowpion):
        _cuts.append(probnnghost(mode.Pislow) + ' < 0.3')
        if apply_pid:
            _cuts.append(probnnpi(mode.Pislow) + ' > 0.3')
            _cuts.append(probnnk(mode.Pislow) + ' < 0.7')

    if mode.mode in config.twotag_modes:
        _cuts.append(pt(mode.D0) + ' >= 1800.')
    else:
        _cuts.append(pt(mode.D0) + ' >= 4000.')
        _cuts.append('TMath::Log(' + ipchi2(mode.D0) + ') < 1.')

    return ' && '.join(['({})'.format(x) for x in _cuts])
def download(modename, polarity, year, full, test=False, mc=None, njobs=1):
    """Apply the preselection to the ntuples of a mode and store the result.

    The input files of the mode are processed with ``treesplitter`` in
    chunks of 25 files, the resulting temporary ROOT files are read back
    and written into the bcolz store of the mode (replacing any previous
    content), and the buffered data of the mode is invalidated.

    :modename: name of the mode, forwarded to ``get_mode``.
    :polarity: polarity, forwarded to ``get_mode``.
    :year: year, forwarded to ``get_mode``.
    :full: if different from 1 and ``mc`` is None, only events whose event
        number is divisible by ``int(1 / full)`` are kept.
    :test: if true, only the first 4 input files are used and the store is
        filled in chunks of 100 instead of 500000 events.
    :mc: forwarded to ``get_mode``; ``'mc'`` additionally requests the
        ``Dstp_BKGCAT`` variable.
    :njobs: number of workers of the ``ProcessingPool``.

    """
    import root_pandas
    log.info('Getting data for {} {} {}'.format(
        modename, polarity, year))

    mode = get_mode(polarity, year, modename, mc)
    # I accidentally forgot the p in Dstp. Got to rename everything now for
    # this one exception. Hack incoming
    rename_dst = modename == 'WS' and year == 2016
    if rename_dst:
        # As this is the start, hack name of the particle in the mode.
        mode.Dstp.name = 'Dst'

    try:
        sel = get_root_preselection.get(mode)

        # Always download the entire MC
        if full != 1 and mc is None:
            ctr = int(1./float(full))
            sel = '({} % {} == 0) && '.format(evt_num(), ctr) + sel
            log.info('Using ({} % {} == 0)'.format(evt_num(), ctr))

        input_files = mode.get_file_list()
        if test:
            input_files = input_files[:4]
        chunked = list(helpers.chunks(input_files, 25))
        length = len(chunked)

        # While the code is in development, just get any variables we can
        # access
        for part in mode.head.all_mothers() + mode.head.all_daughters():
            for func in variables.__all__:
                try:
                    getattr(variables, func)(part)
                except variables.AccessorUsage:
                    pass

        # Make some sorted variables. Saves the hassle when later training
        # BDTs
        arg_sorted_ip = '{},{},{},{}'.format(
            *[ipchi2(p) for p in mode.D0.all_daughters()])
        arg_sorted_pt = '{},{},{},{}'.format(
            *[pt(p) for p in mode.D0.all_daughters()])

        add_vars = {
            'delta_m': '{} - {}'.format(m(mode.Dstp), m(mode.D0)),
            'delta_m_dtf': '{} - {}'.format(dtf_m(mode.Dstp), dtf_m(mode.D0)),
            'ltime_ratio': '{} / {}'.format(ltime(mode.D0), config.Dz_ltime),
            'ipchi2_1': 'ROOTex::Leading({})'.format(arg_sorted_ip),
            'ipchi2_2': 'ROOTex::SecondLeading({})'.format(arg_sorted_ip),
            'ipchi2_3': 'ROOTex::ThirdLeading({})'.format(arg_sorted_ip),
            'ipchi2_4': 'ROOTex::FourthLeading({})'.format(arg_sorted_ip),
            'pt_1': 'ROOTex::Leading({})'.format(arg_sorted_pt),
            'pt_2': 'ROOTex::SecondLeading({})'.format(arg_sorted_pt),
            'pt_3': 'ROOTex::ThirdLeading({})'.format(arg_sorted_pt),
            'pt_4': 'ROOTex::FourthLeading({})'.format(arg_sorted_pt),
        }
        variables_needed = list(variables.all_ever_used)
        if mc == 'mc':
            variables_needed.append('Dstp_BKGCAT')

        def run_splitter(fns):
            # Each chunk of input files is written to its own temporary
            # ROOT file, whose path is handed back to the caller.
            temp_file = tempfile.mktemp('.root')
            treesplitter(files=fns, treename=mode.get_tree_name(),
                         output=temp_file, variables=variables_needed,
                         selection=sel, addvariables=add_vars)
            return temp_file

        pool = ProcessingPool(njobs)
        temp_files = []
        for r in tqdm.tqdm(pool.uimap(run_splitter, chunked),
                           leave=True, total=length, smoothing=0):
            temp_files.append(r)

        log.info('Created {} temporary files.'.format(len(temp_files)))
        bcolz_folder = config.bcolz_locations.format(mode.get_store_name())

        try:
            log.info('Removing already existing data at {}'.format(
                bcolz_folder))
            shutil.rmtree(bcolz_folder)
        except OSError:
            log.info('No previous data found. Nothing to delete.')

        # The chunk size follows the ``test`` argument of this function.
        df_gen = root_pandas.read_root(temp_files, mode.get_tree_name(),
                                       chunksize=100 if test else 500000)

        # New storage using bcolz because better
        ctuple = None

        for df in df_gen:
            log.info('Adding {} events of {} to store {}.'.format(
                len(df), mode.get_tree_name(), bcolz_folder))
            if rename_dst:
                # Undo the particle-name hack in the column names.
                new_names = {
                    old: old.replace('Dst', 'Dstp')
                    for old in df.columns if 'Dst' in old
                }
                df = df.rename(index=str, columns=new_names)
            if ctuple is None:
                ctuple = bcolz.ctable.fromdataframe(df, rootdir=bcolz_folder)
            else:
                ctuple.append(df.to_records(index=False))

        for f in temp_files:
            os.remove(f)
        # Loop and delete everything in the datastore that needs to be
        # recached
        remove_buffer_for_mode(mode.mode)
    finally:
        if rename_dst:
            # Restore the real particle name even if something above
            # failed, so the mode object is never left in the hacked state.
            mode.Dstp.name = 'Dstp'
def train_bdts(sw=False, comb_bkg=False):
    """Train and store the BDT classifiers for the current mode.

    Three classifiers are configured and fitted on the training sample
    returned by ``bdt_data.prep_data_for_sklearn``: a gradient boosting
    classifier with exponential loss and two ``ugb`` classifiers with a
    kNN and a binned flatness loss in the D0 decay time. They are dumped
    with ``bdt_utils.dump_classifiers`` and the buffer of
    ``get_bdt_discriminant`` is removed afterwards.

    :sw: forwarded to ``prep_data_for_sklearn``; also selects
        ``min_samples_leaf`` (2000 if true, 10 otherwise).
    :comb_bkg: forwarded as ``comb_data`` to ``prep_data_for_sklearn`` and
        as ``comb_bkg`` to ``dump_classifiers``.

    """
    current = gcm()
    log.info('Training BDTs for {} {} {}'.format(current.mode,
                                                 current.polarity,
                                                 current.year))
    prepared = bdt_data.prep_data_for_sklearn(
        sw=sw, same_weight=True, comb_data=comb_bkg)
    (train, test, train_lbl, test_lbl), features, spectators = prepared

    uniform_features = [vars.ltime(gcm().D0)]

    log.info('Configuring classifiers')
    # Settings shared by all three classifiers. The learning rate is 0.1
    # irrespective of ``comb_bkg``.
    shared = dict(
        max_depth=3,
        n_estimators=400,
        learning_rate=0.1,
        min_samples_leaf=2000 if sw else 10,
    )

    classifiers = {}
    classifiers['Exponential'] = GradientBoostingClassifier(
        loss='exponential', **shared)

    knn_loss = ugb.KnnFlatnessLossFunction(uniform_features,
                                           fl_coefficient=3.,
                                           power=1.3,
                                           uniform_label=1,
                                           max_groups=2000,
                                           n_neighbours=300)
    classifiers['KnnFlatness'] = ugb.UGradientBoostingClassifier(
        loss=knn_loss, train_features=features, **shared)

    bin_loss = ugb.BinFlatnessLossFunction(uniform_features,
                                           fl_coefficient=3.,
                                           power=2.0,
                                           uniform_label=1,
                                           n_bins=15)
    classifiers['BinFlatness'] = ugb.UGradientBoostingClassifier(
        loss=bin_loss, train_features=features, **shared)

    log.info('Fitting classifiers')
    # The flatness classifiers are additionally given the uniform feature
    # columns; the fit order is kept fixed.
    with_uniform = features + uniform_features
    fit_plan = (
        ('Exponential', features),
        ('KnnFlatness', with_uniform),
        ('BinFlatness', with_uniform),
    )
    for name, columns in fit_plan:
        classifiers[name].fit(train[columns], train_lbl,
                              sample_weight=train.weights)

    log.info('Pickling the thing')
    bdt_utils.dump_classifiers(classifiers, comb_bkg=comb_bkg)
    buffer.remove_buffer_for_function(get_bdt_discriminant)
def d0_lifetime_permille(df):
    """Select entries whose D0 decay time lies inside the fixed window.

    :df: frame holding the column named ``ltime(gcm().D0)``.
    :returns: boolean mask that is true where
        0.0001725 < decay time < 0.00326.

    """
    decay_time = df[ltime(gcm().D0)]
    above_lower = decay_time > 0.0001725
    below_upper = decay_time < 0.00326
    return above_lower & below_upper