def prepare_data(branchinfo_file="vars.yaml", region="2j2b",
                 data_dir="/home/ddavis/ATLAS/data/h5s"):
    """Assemble feature, label, weight and auxiliary-label arrays.

    Loads the ttbar, tW_DR and tW_DS datasets for ``region``, keeps only the
    columns listed for that region in ``branchinfo_file``, rescales the
    weights and concatenates everything in the order ttbar, tW_DR, tW_DS.

    Parameters
    ----------
    branchinfo_file : str
        YAML file mapping a region name to the list of columns to keep.
    region : str
        Region key; selects both the entry in the YAML file and the
        ``*_r{region}.h5`` input files.
    data_dir : str
        Directory holding the input ``.h5`` files. The default reproduces
        the location that used to be hard-coded.

    Returns
    -------
    tuple
        ``(X, y, w, z)``: feature matrix, labels (ttbar 0, tW_DR 1,
        tW_DS 1), event weights and auxiliary labels (ttbar 1, tW_DR 1,
        tW_DS 0).

    Raises
    ------
    KeyError
        If ``region`` has no entry in the YAML file.
    """
    with open(branchinfo_file, "r") as f:
        # NOTE(review): FullLoader can build more than plain YAML types; if
        # this file can ever come from an untrusted source, switch to
        # yaml.safe_load.
        branches = yaml.load(f, Loader=yaml.FullLoader)
    branches = branches[region]

    ttbar = from_pytables(f"{data_dir}/ttbar_r{region}.h5", label=0, auxlabel=1)
    tW_DR = from_pytables(f"{data_dir}/tW_DR_r{region}.h5", label=1, auxlabel=1)
    tW_DS = from_pytables(f"{data_dir}/tW_DS_r{region}.h5", label=1, auxlabel=0)

    tW_DR.keep_columns(branches)
    ttbar.keep_columns(branches)
    tW_DS.keep_columns(branches)

    # Presumably rescales the first dataset's weights so that their sum
    # matches the second's -- confirm against the twaml documentation.
    scale_weight_sum(tW_DS, ttbar)
    scale_weight_sum(tW_DR, ttbar)

    # Each tW sample gets half the scale factor applied to ttbar.
    tW_DR.weights *= 50
    tW_DS.weights *= 50
    ttbar.weights *= 100

    X = pd.concat([ttbar.df, tW_DR.df, tW_DS.df]).to_numpy()
    w = np.concatenate([ttbar.weights, tW_DR.weights, tW_DS.weights])
    # NOTE(review): label_asarray / auxlabel_asarray are read as attributes
    # here (not called). Confirm this matches the installed twaml version,
    # where they may be methods.
    y = np.concatenate(
        [ttbar.label_asarray, tW_DR.label_asarray, tW_DS.label_asarray])
    z = np.concatenate([
        ttbar.auxlabel_asarray, tW_DR.auxlabel_asarray,
        tW_DS.auxlabel_asarray
    ])
    return (X, y, w, z)
def get_combined():
    """Load the ttbar, tW_DR and tW_DS datasets with rescaled tW weights.

    ttbar and tW_DR are loaded with label 0 and tW_DS with label 1. Both tW
    samples are passed through ``scale_weight_sum`` with ttbar as the second
    argument and then have their weights halved. The three weight sums are
    printed before returning.

    Returns
    -------
    tuple
        ``(ttbar, tW_DR, tW_DS)`` datasets.
    """
    reference = from_pytables("ttbar.h5", "ttbar", label=0)
    dr_sample = from_pytables("tW_DR.h5", "tW_DR", label=0)
    ds_sample = from_pytables("tW_DS.h5", "tW_DS", label=1)
    tw_samples = (dr_sample, ds_sample)

    # First rescale both tW samples against ttbar ...
    for sample in tw_samples:
        scale_weight_sum(sample, reference)
    # ... then halve each of them.
    for sample in tw_samples:
        sample.weights *= 0.5

    print(*(d.weights.sum() for d in (reference, dr_sample, ds_sample)))
    return reference, dr_sample, ds_sample
def test_save_and_read():
    """Round-trip ``ds`` through a pytables file and compare the contents.

    ``ds`` is a dataset defined outside this function. It is written to
    ``outfile.h5`` and read back under the same name; the feature matrix
    and the weights must agree to 6 decimal places.
    """
    out_path = "outfile.h5"
    ds.to_pytables(out_path)
    reloaded = from_pytables(out_path, ds.name)

    pairs = (
        (ds.df.to_numpy(), reloaded.df.to_numpy()),
        (ds.weights, reloaded.weights),
    )
    for written, read_back in pairs:
        np.testing.assert_array_almost_equal(written, read_back, 6)
def test_auxweights():
    """Check auxiliary-weight handling across append, ``+``, swap and I/O.

    Covers:
    * ``append`` and ``+`` give the same ``phi_lep1`` auxiliary weights;
    * ``change_weights`` updates ``weight_name`` and swaps the chosen
      auxiliary weight with the previous main weight;
    * the swapped-out ``weight_nominal`` survives a pytables round trip.

    The temporary ``outfile1.h5`` is removed even when an assertion fails.
    """
    root_file = "tests/data/test_file.root"
    branches = ["pT_lep1", "pT_lep2", "eta_lep1", "eta_lep2"]

    def load(name):
        # A fresh auxweights list per call, as in the original inline calls.
        return from_root(
            [root_file],
            name=name,
            branches=branches,
            auxweights=["phi_lep1", "phi_lep2"],
        )

    # In-place append ...
    ds1 = load("myds")
    ds2 = load("ds2")
    ds1.append(ds2)

    # ... must agree with the result of the + operator.
    dsa = load("myds")
    dsb = load("ds2")
    dsc = dsa + dsb
    np.testing.assert_array_almost_equal(ds1.auxweights["phi_lep1"],
                                         dsc.auxweights["phi_lep1"], 5)

    dsc.change_weights("phi_lep2")
    assert dsc.weight_name == "phi_lep2"

    # Reference arrays read straight from the ROOT tree.
    tree = uproot.open(root_file)["WtLoop_nominal"]
    pl2 = tree.array("phi_lep2")
    nw2 = tree.array("weight_nominal")

    ds2.change_weights("phi_lep2")
    np.testing.assert_array_almost_equal(ds2.weights, pl2, 5)
    assert "phi_lep2" not in ds2.auxweights
    assert "weight_nominal" in ds2.auxweights

    out_path = "outfile1.h5"
    try:
        ds2.to_pytables(out_path)
        ds2pt = from_pytables(out_path, "ds2", weight_name="phi_lep2")
        print(ds2pt.auxweights)
        np.testing.assert_array_almost_equal(
            ds2pt.auxweights["weight_nominal"].to_numpy(), nw2)
    finally:
        # Do not leave the temporary file behind on failure.
        if os.path.exists(out_path):
            os.remove(out_path)
def prepare_data(region="3jHb", delete_datasets=True,
                 data_dir="/Users/ddavis/Desktop/newfullkincomb"):
    """Assemble feature, label, weight and auxiliary-label arrays.

    Loads the ttbar and tW_DR datasets for ``region`` (the tW_DS sample is
    not used here), rescales the weights and concatenates everything in the
    order ttbar, tW_DR.

    Parameters
    ----------
    region : str
        Region tag; selects the ``*_r{region}.h5`` input files.
    delete_datasets : bool
        If true, drop the local dataset references before returning.
    data_dir : str
        Directory holding the input ``.h5`` files. The default reproduces
        the location that used to be hard-coded.

    Returns
    -------
    tuple
        ``(X, y, w, z)``: feature matrix, labels (ttbar 0, tW_DR 1), event
        weights and auxiliary labels (1 for both samples).
    """
    ttbar = from_pytables(f"{data_dir}/ttbar_r{region}.h5", label=0, auxlabel=1)
    tW_DR = from_pytables(f"{data_dir}/tW_DR_r{region}.h5", label=1, auxlabel=1)

    # Presumably rescales tW_DR's weights so that their sum matches ttbar's
    # -- confirm against the twaml documentation.
    scale_weight_sum(tW_DR, ttbar)
    tW_DR.weights *= 100
    ttbar.weights *= 100

    X = pd.concat([ttbar.df, tW_DR.df]).to_numpy()
    w = np.concatenate([ttbar.weights, tW_DR.weights])
    y = np.concatenate([ttbar.label_asarray(), tW_DR.label_asarray()])
    z = np.concatenate([ttbar.auxlabel_asarray(), tW_DR.auxlabel_asarray()])

    if delete_datasets:
        del ttbar
        del tW_DR

    print("returning data")
    return (X, y, w, z)
def __init__(
        self,
        name='zero_jet',
        base_directory='../h5',
        signal_h5='sig_one_jet.h5',
        signal_name='sig',
        signal_tree='wt_DR_nominal',
        signal_latex=r'H$\rightarrow\mu\mu$',
        backgd_h5='bkg_zero_jet.h5',
        backgd_name='bkg',
        backgd_tree='tt_nominal',
        backgd_latex=r'Data sideband',
        weight_name='weight',
        variables=['Z_PT_FSR', 'Z_Y_FSR', 'Muons_CosThetaStar', 'm_mumu'],
        has_syst=False,
        syssig_h5='tW_DS_2j2b.h5',
        syssig_name='tW_DS_2j2b',
        syssig_tree='tW_DS',
        syssig_latex=r'$tW$ DS',
        has_mass=True,
        reg_variable='m_mumu',
        reg_latex=r'm_\mu\mu',
):
    """Load signal/background datasets and build the training arrays.

    Sets ``self.X_raw`` / ``self.X`` (raw and standard-scaled features),
    ``self.y`` (labels), ``self.z`` (auxiliary target) and ``self.w``
    (weights), creates the output directory and pickles the fitted scaler
    there twice (default protocol and protocol 2).

    Parameters
    ----------
    name : str
        Tag used in the names of the pickled scaler files.
    base_directory : str
        Directory under which the output directory (named by
        ``self.describe()``) is created.
    signal_h5, signal_name, signal_tree : str
        File, dataset name and tree name passed to ``from_pytables`` for
        the signal sample.
    backgd_h5, backgd_name, backgd_tree : str
        Same, for the background sample.
    signal_latex, backgd_latex : str
        Labels stored on the instance.
    weight_name : str
        Weight name passed to every ``from_pytables`` call.
    variables : list of str
        Feature columns to keep.
    has_syst : bool
        If true, load the systematic signal sample and append it to the
        signal dataset.
    syssig_h5, syssig_name, syssig_tree, syssig_latex : str
        File, dataset name, tree name and label for the systematic sample.
    has_mass : bool
        If true, ``self.z`` is the normalised ``reg_variable`` column
        instead of the auxiliary labels.
    reg_variable, reg_latex : str
        Column used for ``self.z`` when ``has_mass`` is true, and its label.
    """
    # Work on a copy so the shared default list can never be mutated.
    variables = list(variables)

    self.name = name
    self.signal_label, self.backgd_label, self.center_label, self.syssig_label = 1, 0, 1, 0
    self.signal_latex, self.backgd_latex = signal_latex, backgd_latex

    self.signal = from_pytables(signal_h5,
                                signal_name,
                                tree_name=signal_tree,
                                weight_name=weight_name,
                                label=self.signal_label,
                                auxlabel=self.center_label)
    self.backgd = from_pytables(backgd_h5,
                                backgd_name,
                                tree_name=backgd_tree,
                                weight_name=weight_name,
                                label=self.backgd_label,
                                auxlabel=self.center_label)
    self.signal.keep_columns(variables)
    self.backgd.keep_columns(variables)

    self.has_syst = has_syst
    self.syssig_latex = None if not self.has_syst else syssig_latex
    self.losses_test = {'L_gen': [], 'L_dis': [], 'L_diff': []}
    self.losses_train = {'L_gen': [], 'L_dis': [], 'L_diff': []}
    self.has_mass = has_mass
    self.reg_variable = reg_variable
    self.reg_latex = reg_latex

    if self.has_syst:
        self.syssig = from_pytables(syssig_h5,
                                    syssig_name,
                                    tree_name=syssig_tree,
                                    weight_name=weight_name,
                                    label=self.signal_label,
                                    auxlabel=self.syssig_label)
        self.syssig.keep_columns(variables)
        # Append syssig to signal
        self.signal.append(self.syssig)

    # Equalise signal weights to background weights
    scale_weight_sum(self.signal, self.backgd)

    self.X_raw = np.concatenate(
        [self.signal.df.to_numpy(),
         self.backgd.df.to_numpy()])
    scaler = StandardScaler()
    self.X = scaler.fit_transform(self.X_raw)
    self.y = np.concatenate(
        [self.signal.label_asarray(),
         self.backgd.label_asarray()])
    self.z = np.concatenate(
        [self.signal.auxlabel_asarray(),
         self.backgd.auxlabel_asarray()])

    if self.has_mass:
        # Re-read the samples: keep_columns above may have dropped the
        # regression column from self.signal / self.backgd.
        signal = from_pytables(signal_h5,
                               signal_name,
                               tree_name=signal_tree,
                               weight_name=weight_name,
                               label=self.signal_label,
                               auxlabel=self.center_label)
        backgd = from_pytables(backgd_h5,
                               backgd_name,
                               tree_name=backgd_tree,
                               weight_name=weight_name,
                               label=self.backgd_label,
                               auxlabel=self.center_label)
        signal.keep_columns([reg_variable])
        backgd.keep_columns([reg_variable])

        if self.has_syst:
            # self.signal contains the systematic sample, so it has to be
            # appended here as well; otherwise self.z ends up with fewer
            # rows than self.X / self.y / self.w.
            syssig = from_pytables(syssig_h5,
                                   syssig_name,
                                   tree_name=syssig_tree,
                                   weight_name=weight_name,
                                   label=self.signal_label,
                                   auxlabel=self.syssig_label)
            syssig.keep_columns([reg_variable])
            signal.append(syssig)

        def normalise(df):
            # Cap at 200 (in place), then shift by 110 and scale by 70.
            df[df > 200] = 200
            return (df - 110) / 70.

        self.z = np.concatenate([
            normalise(signal.df).to_numpy(),
            normalise(backgd.df).to_numpy()
        ])

    self.w = np.concatenate([self.signal.weights, self.backgd.weights])

    self.output_path = '/'.join([base_directory, self.describe()]) + '/'
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(self.output_path, exist_ok=True)

    # NOTE(review): this prints the bound ``__getitem__`` methods of the two
    # dataframes; possibly ``.shape`` was intended -- confirm.
    print('\033[92m[INFO]\033[0m', self.describe(),
          self.signal.df.__getitem__, self.backgd.df.__getitem__)
    print('\033[92m[INFO]\033[0m', '-' * 20)

    # Store the fitted scaler with the default pickle protocol ...
    with open(self.output_path + self.name + '_event.pkl', 'wb') as pkl:
        pickle.dump(scaler, pkl)
    # ... and again with protocol 2.
    with open(self.output_path + self.name + '_event_py2.pkl', 'wb') as pkl:
        pickle.dump(scaler, pkl, protocol=2)
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from twaml.data import from_pytables
from twaml.data import scale_weight_sum
import matplotlib.pyplot as plt

# Load the 1j1b samples: ttbar gets label 0, tW_DR gets label 1.
ttbar = from_pytables("ttbar_1j1b.h5", "ttbar", label=0)
tW_DR = from_pytables("tW_DR_1j1b.h5", "tW_DR", label=1)

# 1% of the combined sum of weights, taken BEFORE tW_DR is rescaled below;
# used further down as the "min_child_weight" parameter.
sow = ttbar.weights.sum() + tW_DR.weights.sum()
mwfl = sow * 0.01

# Presumably rescales tW_DR's weights so their sum matches ttbar's --
# confirm against the twaml documentation.
scale_weight_sum(tW_DR, ttbar)

# Labels, features and weights, all in the order tW_DR then ttbar.
# NOTE(review): label_asarray is read as an attribute (not called) here --
# confirm this matches the installed twaml version.
y = np.concatenate([tW_DR.label_asarray, ttbar.label_asarray])
X = np.concatenate([tW_DR.df.to_numpy(), ttbar.df.to_numpy()])
w = np.concatenate([tW_DR.weights, ttbar.weights])

# 3-fold shuffled split with a fixed seed.
folder = KFold(n_splits=3, shuffle=True, random_state=414)

# Accumulators; nothing in the code visible here fills them yet.
ttbar_dist = []
tW_dist = []
tW_w_dist = []
ttbar_w_dist = []
roc_aucs = []

for train_idx, test_idx in folder.split(X):
    # Slice features, labels and weights with the same fold indices.
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    w_train, w_test = w[train_idx], w[test_idx]
    # Hyperparameter dictionary for this fold.
    param = {"max_depth": 4, "n_estimators": 150, "min_child_weight": mwfl}