def add_objectCollection(obj, kinem_name, collection_name, df):
    """
    Build one `obj` instance per flattened kinematic entry and store the
    result in `df[collection_name]`.

    Args:
        obj: callable invoked as obj(pt, eta, phi) for every entry
        kinem_name: sequence whose first three items are the names of the
            df columns supplying, in order, the three positional arguments
            of `obj`
        collection_name: name of the df column that receives the objects
        df: dataframe read from and written to (modified in place)

    Returns:
        None
    """
    pt_key, eta_key, phi_key = kinem_name[0], kinem_name[1], kinem_name[2]

    # Flatten each column so the three can be walked in lockstep.
    flat_pt = pup.flatten(df[pt_key])
    flat_eta = pup.flatten(df[eta_key])
    flat_phi = pup.flatten(df[phi_key])

    built = np.array(
        [obj(*kinematics) for kinematics in zip(flat_pt, flat_eta, flat_phi)])

    # The first kinematic column is the template handed to match_shape
    # (presumably to restore the per-event nesting -- confirm in pandautils).
    df[collection_name] = np.array(pup.match_shape(built, df[pt_key]))
def _scale(matrix_train, matrix_test):
    '''
    Use scikit learn to scale features to 0 mean, 1 std.
    Because of event-level structure, we need to flatten X, scale, and then
    reshape back into event format.

    A fresh StandardScaler is fitted per column on the training data only and
    then applied to the matching test column. Both input matrices are
    overwritten column by column (in place) and returned.

    Args:
        matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray
            of unscaled features of events allocated for training
        matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray
            of unscaled features of events allocated for testing

    Returns:
        the same matrices after scaling
    '''
    def _rescale(transform, column, reference):
        # flatten -> (n, 1) column vector for scikit-learn -> back to the
        # layout of `reference`
        flat = pup.flatten(column).reshape(-1, 1)
        return pup.match_shape(transform(flat).ravel(), reference)

    # All warnings raised while scaling are deliberately silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # First column of each matrix serves as the layout template for
        # match_shape.
        ref_test = matrix_test[:, 0]
        ref_train = matrix_train[:, 0]

        # `range` instead of the Python-2-only `xrange`, so the function runs
        # unchanged on Python 2 and Python 3.
        for col in range(matrix_train.shape[1]):
            scaler = StandardScaler()
            # Fit on train first; the test column reuses the fitted scaler.
            matrix_train[:, col] = _rescale(
                scaler.fit_transform, matrix_train[:, col], ref_train)
            matrix_test[:, col] = _rescale(
                scaler.transform, matrix_test[:, col], ref_test)

    return matrix_train, matrix_test
def add_objectCollection(obj, kinem_name, collection_name, df):
    """
    Store a collection of `obj` instances (Jet, Muon or Electron) in the
    dataframe `df` under the column `collection_name`.

    The objects are constructed as obj(pt, eta, phi); the names of the
    dataframe columns holding those three quantities (stored as vectors in
    the current dataframe) must be passed, in that order, as the first three
    items of `kinem_name`. `df` is modified in place and nothing is returned.
    """
    pt_key, eta_key, phi_key = kinem_name[0], kinem_name[1], kinem_name[2]

    # Flatten each column so the three can be walked in lockstep.
    flat_pt = pup.flatten(df[pt_key])
    flat_eta = pup.flatten(df[eta_key])
    flat_phi = pup.flatten(df[phi_key])

    built = np.array(
        [obj(*kinematics) for kinematics in zip(flat_pt, flat_eta, flat_phi)])

    # The first kinematic column is the template handed to match_shape
    # (presumably to restore the per-event nesting -- confirm in pandautils).
    df[collection_name] = np.array(pup.match_shape(built, df[pt_key]))
def transformVars(df):
    '''
    modifies the variables to create the ones that mv2 uses, inserts default
    values when needed, saves new variables in the dataframe

    The per-event arrays already stored in the IP2D/IP3D/IPMP, SV1 and JF
    columns are edited in place (default values are written into them), and
    these columns are added:
        abs(jet_eta),
        jet_ip2, jet_ip2_c, jet_ip2_cu, jet_ip3, jet_ip3_c, jet_ip3_cu,
        jet_ip, jet_ip_c, jet_ip_cu,
        jet_sv1_dR, jet_sv1_Lxy, jet_sv1_L3d, jet_jf_dR

    Args:
    -----
        df: pandas dataframe containing all the interesting variables as
            extracted from the .root file
            (assumes every jet-level cell holds a numpy array with one entry
            per jet -- TODO confirm against the loader)

    Returns:
    --------
        modified mv2-compliant dataframe (the same object that was passed in)
    '''
    from rootpy.vector import LorentzVector, Vector3
    import pandautils as pup

    def _log_ratio(numerator, denominator):
        # Per-event log(numerator / denominator); the result goes through
        # _replaceInfNaN(x, -20) (defined elsewhere; presumably swaps inf/NaN
        # entries for -20 -- confirm). The lambdas keep `apply` working cell
        # by cell on the per-event arrays.
        ratio = df[numerator] / df[denominator]
        return ratio.apply(lambda x: np.log(x)).apply(
            lambda x: _replaceInfNaN(x, -20))

    # -- modify features and set default values
    df['abs(jet_eta)'] = abs(df['jet_eta'])

    # -- create new IPxD features
    # The mask must be taken BEFORE pu is overwritten: once the flagged pu
    # entries are -1 they no longer satisfy the condition, so re-evaluating it
    # for pb / pc would select nothing and leave them untouched.
    # NOTE(review): with pu, pb and pc all set to -1 the log-ratios below
    # evaluate to log(1) = 0 for the flagged jets -- confirm this is the
    # intended default.
    for (pu, pb, pc) in zip(df['jet_ip2d_pu'], df['jet_ip2d_pb'],
                            df['jet_ip2d_pc']):
        invalid = np.logical_or(pu >= 10, pu < -1)
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    for (pu, pb, pc) in zip(df['jet_ip3d_pu'], df['jet_ip3d_pb'],
                            df['jet_ip3d_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    df['jet_ip2'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pu')
    df['jet_ip2_c'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pc')
    df['jet_ip2_cu'] = _log_ratio('jet_ip2d_pc', 'jet_ip2d_pu')
    df['jet_ip3'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pu')
    df['jet_ip3_c'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pc')
    df['jet_ip3_cu'] = _log_ratio('jet_ip3d_pc', 'jet_ip3d_pu')

    # -- create new IPMP features
    for (pu, pb, pc) in zip(df['jet_ipmp_pu'], df['jet_ipmp_pb'],
                            df['jet_ipmp_pc']):
        invalid = pu >= 10  # same reasoning as above: mask before mutation
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    df['jet_ip'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pu')
    df['jet_ip_c'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pc')
    df['jet_ip_cu'] = _log_ratio('jet_ipmp_pc', 'jet_ipmp_pu')

    # -- SV1 features
    # displacement of the SV1 vertex with respect to the primary vertex
    dx = df['jet_sv1_vtx_x'] - df['PVx']
    dy = df['jet_sv1_vtx_y'] - df['PVy']
    dz = df['jet_sv1_vtx_z'] - df['PVz']

    # reusable ROOT objects, refilled for every jet
    v_jet = LorentzVector()
    pv2sv = Vector3()
    sv1_L3d = []
    sv1_Lxy = []
    dR = []

    # NOTE(review): `index` is a position (enumerate) but is used below as a
    # label in df[...][index] / dx[index]; this assumes df carries the default
    # 0..n-1 index -- confirm.
    for index, dxi in enumerate(dx):  # loop thru events
        sv1_L3d_ev = []
        sv1_Lxy_ev = []
        dR_ev = []
        for jet in range(len(dxi)):  # loop thru jets
            v_jet.SetPtEtaPhiM(df['jet_pt'][index][jet],
                               df['jet_eta'][index][jet],
                               df['jet_phi'][index][jet],
                               df['jet_m'][index][jet])
            if dxi[jet].size != 0:
                # a secondary vertex exists for this jet
                dx_j, dy_j, dz_j = dx[index][jet], dy[index][jet], dz[index][jet]
                # [0] keeps the first entry of the per-jet vertex array
                sv1_L3d_ev.append(
                    np.sqrt(pow(dx_j, 2) + pow(dy_j, 2) + pow(dz_j, 2))[0])
                sv1_Lxy_ev.append(math.hypot(dx_j, dy_j))
                pv2sv.SetXYZ(dx_j, dy_j, dz_j)
                jetAxis = Vector3(v_jet.Px(), v_jet.Py(), v_jet.Pz())
                dR_ev.append(pv2sv.DeltaR(jetAxis))
            else:
                # no secondary vertex: default values
                dR_ev.append(-1)
                sv1_Lxy_ev.append(-100)
                sv1_L3d_ev.append(-100)
        sv1_Lxy.append(sv1_Lxy_ev)
        dR.append(dR_ev)
        sv1_L3d.append(sv1_L3d_ev)

    df['jet_sv1_dR'] = dR
    df['jet_sv1_Lxy'] = sv1_Lxy
    df['jet_sv1_L3d'] = sv1_L3d

    # -- add more default values for sv1 variables
    # number of vertex entries per jet, arranged like df['jet_pt'] by
    # match_shape; 0 means the jet has no SV1 vertex
    sv1_vtx_ok = pup.match_shape(
        np.asarray([len(el) for event in df['jet_sv1_vtx_x'] for el in event]),
        df['jet_pt'])

    for (ok4event, sv1_ntkv4event, sv1_n2t4event, sv1_mass4event,
         sv1_efrc4event, sv1_sig34event) in zip(
             sv1_vtx_ok, df['jet_sv1_ntrkv'], df['jet_sv1_n2t'],
             df['jet_sv1_m'], df['jet_sv1_efc'], df['jet_sv1_sig3d']):
        no_vtx = np.asarray(ok4event) == 0
        sv1_ntkv4event[no_vtx] = -1
        sv1_n2t4event[no_vtx] = -1
        sv1_mass4event[no_vtx] = -1000
        sv1_efrc4event[no_vtx] = -1
        sv1_sig34event[no_vtx] = -100

    # -- JF features
    jf_dR = []
    for (etas, phis, masses) in zip(df['jet_jf_deta'], df['jet_jf_dphi'],
                                    df['jet_jf_m']):  # loop thru events
        jf_dR_ev = []
        for m in range(len(masses)):  # loop thru jets
            if masses[m] > 0:
                jf_dR_ev.append(
                    np.sqrt(etas[m] * etas[m] + phis[m] * phis[m]))
            else:
                jf_dR_ev.append(-10)
        jf_dR.append(jf_dR_ev)
    df['jet_jf_dR'] = jf_dR

    # -- add more default values for jf variables
    for (jf_mass, jf_n2tv, jf_ntrkv, jf_nvtx, jf_nvtx1t, jf_efrc,
         jf_sig3) in zip(df['jet_jf_m'], df['jet_jf_n2t'],
                         df['jet_jf_ntrkAtVx'], df['jet_jf_nvtx'],
                         df['jet_jf_nvtx1t'], df['jet_jf_efc'],
                         df['jet_jf_sig3d']):
        # taken once, before jf_mass itself receives its default
        no_jf = jf_mass <= 0
        jf_n2tv[no_jf] = -1
        jf_ntrkv[no_jf] = -1
        jf_nvtx[no_jf] = -1
        jf_nvtx1t[no_jf] = -1
        jf_mass[no_jf] = -1e3
        jf_efrc[no_jf] = -1
        jf_sig3[no_jf] = -100

    return df
def transformVars(df):
    '''
    modifies the variables to create the ones that mv2 uses, inserts default
    values when needed, saves new variables in the dataframe

    The per-event arrays already stored in the IP2D/IP3D/IPMP, SV1 and JF
    columns are edited in place (default values are written into them), and
    these columns are added:
        abs(jet_eta),
        jet_ip2, jet_ip2_c, jet_ip2_cu, jet_ip3, jet_ip3_c, jet_ip3_cu,
        jet_ip, jet_ip_c, jet_ip_cu,
        jet_sv1_dR, jet_sv1_Lxy, jet_sv1_L3d, jet_jf_dR

    Args:
    -----
        df: pandas dataframe containing all the interesting variables as
            extracted from the .root file
            (assumes every jet-level cell holds a numpy array with one entry
            per jet -- TODO confirm against the loader)

    Returns:
    --------
        modified mv2-compliant dataframe (the same object that was passed in)
    '''
    from rootpy.vector import LorentzVector, Vector3
    import pandautils as pup

    def _log_ratio(numerator, denominator):
        # Per-event log(numerator / denominator); the result goes through
        # replaceInfNaN(x, -20) (defined elsewhere; presumably swaps inf/NaN
        # entries for -20 -- confirm). The lambdas keep `apply` working cell
        # by cell on the per-event arrays.
        ratio = df[numerator] / df[denominator]
        return ratio.apply(lambda x: np.log(x)).apply(
            lambda x: replaceInfNaN(x, -20))

    # -- modify features and set default values
    df['abs(jet_eta)'] = abs(df['jet_eta'])

    # -- create new IPxD features
    # The mask must be taken BEFORE pu is overwritten: once the flagged pu
    # entries are -1 they no longer satisfy the condition, so re-evaluating it
    # for pb / pc would select nothing and leave them untouched.
    # NOTE(review): with pu, pb and pc all set to -1 the log-ratios below
    # evaluate to log(1) = 0 for the flagged jets -- confirm this is the
    # intended default.
    for (pu, pb, pc) in zip(df['jet_ip2d_pu'], df['jet_ip2d_pb'],
                            df['jet_ip2d_pc']):
        invalid = np.logical_or(pu >= 10, pu < -1)
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    for (pu, pb, pc) in zip(df['jet_ip3d_pu'], df['jet_ip3d_pb'],
                            df['jet_ip3d_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    df['jet_ip2'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pu')
    df['jet_ip2_c'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pc')
    df['jet_ip2_cu'] = _log_ratio('jet_ip2d_pc', 'jet_ip2d_pu')
    df['jet_ip3'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pu')
    df['jet_ip3_c'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pc')
    df['jet_ip3_cu'] = _log_ratio('jet_ip3d_pc', 'jet_ip3d_pu')

    # -- create new IPMP features
    for (pu, pb, pc) in zip(df['jet_ipmp_pu'], df['jet_ipmp_pb'],
                            df['jet_ipmp_pc']):
        invalid = pu >= 10  # same reasoning as above: mask before mutation
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1

    df['jet_ip'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pu')
    df['jet_ip_c'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pc')
    df['jet_ip_cu'] = _log_ratio('jet_ipmp_pc', 'jet_ipmp_pu')

    # -- SV1 features
    # displacement of the SV1 vertex with respect to the primary vertex
    dx = df['jet_sv1_vtx_x'] - df['PVx']
    dy = df['jet_sv1_vtx_y'] - df['PVy']
    dz = df['jet_sv1_vtx_z'] - df['PVz']

    # reusable ROOT objects, refilled for every jet
    v_jet = LorentzVector()
    pv2sv = Vector3()
    sv1_L3d = []
    sv1_Lxy = []
    dR = []

    # NOTE(review): `index` is a position (enumerate) but is used below as a
    # label in df[...][index] / dx[index]; this assumes df carries the default
    # 0..n-1 index -- confirm.
    for index, dxi in enumerate(dx):  # loop thru events
        sv1_L3d_ev = []
        sv1_Lxy_ev = []
        dR_ev = []
        for jet in range(len(dxi)):  # loop thru jets
            v_jet.SetPtEtaPhiM(df['jet_pt'][index][jet],
                               df['jet_eta'][index][jet],
                               df['jet_phi'][index][jet],
                               df['jet_m'][index][jet])
            if dxi[jet].size != 0:
                # a secondary vertex exists for this jet
                dx_j, dy_j, dz_j = dx[index][jet], dy[index][jet], dz[index][jet]
                # [0] keeps the first entry of the per-jet vertex array
                sv1_L3d_ev.append(
                    np.sqrt(pow(dx_j, 2) + pow(dy_j, 2) + pow(dz_j, 2))[0])
                sv1_Lxy_ev.append(math.hypot(dx_j, dy_j))
                pv2sv.SetXYZ(dx_j, dy_j, dz_j)
                jetAxis = Vector3(v_jet.Px(), v_jet.Py(), v_jet.Pz())
                dR_ev.append(pv2sv.DeltaR(jetAxis))
            else:
                # no secondary vertex: default values
                dR_ev.append(-1)
                sv1_Lxy_ev.append(-100)
                sv1_L3d_ev.append(-100)
        sv1_Lxy.append(sv1_Lxy_ev)
        dR.append(dR_ev)
        sv1_L3d.append(sv1_L3d_ev)

    df['jet_sv1_dR'] = dR
    df['jet_sv1_Lxy'] = sv1_Lxy
    df['jet_sv1_L3d'] = sv1_L3d

    # -- add more default values for sv1 variables
    # number of vertex entries per jet, arranged like df['jet_pt'] by
    # match_shape; 0 means the jet has no SV1 vertex
    sv1_vtx_ok = pup.match_shape(
        np.asarray([len(el) for event in df['jet_sv1_vtx_x'] for el in event]),
        df['jet_pt'])

    for (ok4event, sv1_ntkv4event, sv1_n2t4event, sv1_mass4event,
         sv1_efrc4event, sv1_sig34event) in zip(
             sv1_vtx_ok, df['jet_sv1_ntrkv'], df['jet_sv1_n2t'],
             df['jet_sv1_m'], df['jet_sv1_efc'], df['jet_sv1_sig3d']):
        no_vtx = np.asarray(ok4event) == 0
        sv1_ntkv4event[no_vtx] = -1
        sv1_n2t4event[no_vtx] = -1
        sv1_mass4event[no_vtx] = -1000
        sv1_efrc4event[no_vtx] = -1
        sv1_sig34event[no_vtx] = -100

    # -- JF features
    jf_dR = []
    for (etas, phis, masses) in zip(df['jet_jf_deta'], df['jet_jf_dphi'],
                                    df['jet_jf_m']):  # loop thru events
        jf_dR_ev = []
        for m in range(len(masses)):  # loop thru jets
            if masses[m] > 0:
                jf_dR_ev.append(
                    np.sqrt(etas[m] * etas[m] + phis[m] * phis[m]))
            else:
                jf_dR_ev.append(-10)
        jf_dR.append(jf_dR_ev)
    df['jet_jf_dR'] = jf_dR

    # -- add more default values for jf variables
    for (jf_mass, jf_n2tv, jf_ntrkv, jf_nvtx, jf_nvtx1t, jf_efrc,
         jf_sig3) in zip(df['jet_jf_m'], df['jet_jf_n2t'],
                         df['jet_jf_ntrkAtVx'], df['jet_jf_nvtx'],
                         df['jet_jf_nvtx1t'], df['jet_jf_efc'],
                         df['jet_jf_sig3d']):
        # taken once, before jf_mass itself receives its default
        no_jf = jf_mass <= 0
        jf_n2tv[no_jf] = -1
        jf_ntrkv[no_jf] = -1
        jf_nvtx[no_jf] = -1
        jf_nvtx1t[no_jf] = -1
        jf_mass[no_jf] = -1e3
        jf_efrc[no_jf] = -1
        jf_sig3[no_jf] = -100

    return df