예제 #1
0
def add_objectCollection(obj, kinem_name, collection_name, df):
    """
    Build one ``obj`` per entry of three kinematic columns and store the
    result in ``df[collection_name]``.

    Args:
        obj: callable invoked as ``obj(pt, eta, phi)`` for every flattened entry
        kinem_name: indexable holding the names of the pt, eta and phi
            columns at positions 0, 1 and 2
        collection_name: name of the dataframe column that receives the objects
        df: dataframe that is read from and written to (modified in place)
    Returns:
        None
    """
    # The first kinematic column doubles as the shape template for the output.
    template = df[kinem_name[0]]
    flat_pt = pup.flatten(template)
    flat_eta = pup.flatten(df[kinem_name[1]])
    flat_phi = pup.flatten(df[kinem_name[2]])

    built = []
    for kinematics in zip(flat_pt, flat_eta, flat_phi):
        built.append(obj(*kinematics))

    # Restore the nesting of the template column before storing the objects.
    df[collection_name] = np.array(pup.match_shape(np.array(built), template))
    return None
def _scale(matrix_train, matrix_test):
    '''
    Use scikit learn to scale features to 0 mean, 1 std.
    Because of event-level structure, we need to flatten X, scale, and then reshape back into event format.
    For every column a fresh StandardScaler is fit on the training column only and then applied,
    unchanged, to the matching test column.
    Args:
        matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
        matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray of unscaled features of events allocated for testing
    Returns:
        the same matrices after scaling (both inputs are overwritten in place and returned as a tuple)
    '''
    with warnings.catch_warnings():
        # All warnings raised while scaling are silenced inside this block.
        warnings.simplefilter("ignore")

        # First column of each matrix is the template handed to pup.match_shape
        # to restore the per-event structure after scaling.
        ref_test = matrix_test[:, 0]
        ref_train = matrix_train[:, 0]

        # range (not the Python-2-only xrange) keeps this runnable on Python 2 and 3.
        for col in range(matrix_train.shape[1]):
            scaler = StandardScaler()

            # Fit on the training column only ...
            flat_train = pup.flatten(matrix_train[:, col]).reshape(-1, 1)
            matrix_train[:, col] = pup.match_shape(
                scaler.fit_transform(flat_train).ravel(), ref_train)

            # ... and reuse the fitted statistics for the test column.
            flat_test = pup.flatten(matrix_test[:, col]).reshape(-1, 1)
            matrix_test[:, col] = pup.match_shape(
                scaler.transform(flat_test).ravel(), ref_test)

    return matrix_train, matrix_test
예제 #3
0
def add_objectCollection(obj, kinem_name, collection_name, df):
    """
    Add a column of objects (e.g. Jet, Muon or Electron) to the dataframe df.

    Every object is created as obj(pt, eta, phi). The three quantities are
    read from the dataframe columns whose names are given in kinem_name
    (positions 0, 1 and 2), flattened, and the resulting objects are
    reshaped to match the first of those columns before being stored
    under collection_name. The dataframe is modified in place and
    nothing is returned.
    """
    pt_column = df[kinem_name[0]]
    flattened = (
        pup.flatten(pt_column),
        pup.flatten(df[kinem_name[1]]),
        pup.flatten(df[kinem_name[2]]),
    )

    instances = []
    for pt, eta, phi in zip(*flattened):
        instances.append(obj(pt, eta, phi))

    # The pt column serves as the shape template for the new collection.
    reshaped = pup.match_shape(np.array(instances), pt_column)
    df[collection_name] = np.array(reshaped)
    return None
def transformVars(df):
    '''
    modifies the variables to create the ones that mv2 uses, inserts default values when needed, saves new variables
    in the dataframe. The per-jet arrays stored in the IPxD, IPMP, SV1 and JF input columns are overwritten in place
    when default values are inserted.
    Args:
    -----
        df: pandas dataframe containing all the interesting variables as extracted from the .root file
    Returns:
    --------
        modified mv2-compliant dataframe (the same object that was passed in), with the added columns
        'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu', 'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu',
        'jet_ip', 'jet_ip_c', 'jet_ip_cu', 'jet_sv1_dR', 'jet_sv1_Lxy', 'jet_sv1_L3d' and 'jet_jf_dR'
    '''
    from rootpy.vector import LorentzVector, Vector3
    import pandautils as pup

    def _log_ratio(numerator, denominator):
        # Per-event log of the ratio of two columns. The lambdas are kept so
        # that np.log is called on each event's array individually; the result
        # is then passed through _replaceInfNaN with -20 (presumably the
        # substitute for inf/NaN entries -- defined elsewhere in the file).
        return (df[numerator] / df[denominator]).apply(
            lambda x: np.log(x)).apply(lambda x: _replaceInfNaN(x, -20))

    # -- modify features and set default values
    df['abs(jet_eta)'] = abs(df['jet_eta'])

    # -- create new IPxD features
    # The mask must be computed ONCE, before pu is overwritten: recomputing it
    # from the already-reset pu yields an all-False mask, so pb and pc would
    # never receive their default.
    # NOTE(review): with pu, pb and pc all reset to -1 the log-ratios below
    # evaluate to log(1) = 0 for those jets, not to the -20 fallback -- confirm
    # this is the intended default.
    for (pu, pb, pc) in zip(df['jet_ip2d_pu'], df['jet_ip2d_pb'],
                            df['jet_ip2d_pc']):
        invalid = np.logical_or(pu >= 10, pu < -1)
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    for (pu, pb, pc) in zip(df['jet_ip3d_pu'], df['jet_ip3d_pb'],
                            df['jet_ip3d_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    df['jet_ip2'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pu')
    df['jet_ip2_c'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pc')
    df['jet_ip2_cu'] = _log_ratio('jet_ip2d_pc', 'jet_ip2d_pu')
    df['jet_ip3'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pu')
    df['jet_ip3_c'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pc')
    df['jet_ip3_cu'] = _log_ratio('jet_ip3d_pc', 'jet_ip3d_pu')

    # -- create new IPMP features
    for (pu, pb, pc) in zip(df['jet_ipmp_pu'], df['jet_ipmp_pb'],
                            df['jet_ipmp_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    df['jet_ip'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pu')
    df['jet_ip_c'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pc')
    df['jet_ip_cu'] = _log_ratio('jet_ipmp_pc', 'jet_ipmp_pu')

    # -- SV1 features
    # displacement of the secondary vertex with respect to the primary vertex
    dx = df['jet_sv1_vtx_x'] - df['PVx']
    dy = df['jet_sv1_vtx_y'] - df['PVy']
    dz = df['jet_sv1_vtx_z'] - df['PVz']

    v_jet = LorentzVector()
    pv2sv = Vector3()
    sv1_L3d = []
    sv1_Lxy = []
    dR = []

    # Iterate the columns in lockstep instead of indexing them with an
    # enumerate() counter: Series[int] is a label lookup, which is wrong (or
    # raises) whenever the dataframe index is not 0..n-1.
    for (dx_ev, dy_ev, dz_ev, pt_ev, eta_ev, phi_ev, m_ev) in zip(
            dx, dy, dz, df['jet_pt'], df['jet_eta'], df['jet_phi'],
            df['jet_m']):  # loop thru events
        sv1_L3d_ev = []
        sv1_Lxy_ev = []
        dR_ev = []
        for (dx_jet, dy_jet, dz_jet, pt, eta, phi, mass) in zip(
                dx_ev, dy_ev, dz_ev, pt_ev, eta_ev, phi_ev,
                m_ev):  # loop thru jets
            if dx_jet.size != 0:
                # use the first stored vertex entry of the jet
                x, y, z = dx_jet[0], dy_jet[0], dz_jet[0]
                sv1_L3d_ev.append(np.sqrt(pow(x, 2) + pow(y, 2) + pow(z, 2)))
                sv1_Lxy_ev.append(math.hypot(x, y))

                # angular distance between the PV->SV direction and the jet axis
                v_jet.SetPtEtaPhiM(pt, eta, phi, mass)
                pv2sv.SetXYZ(x, y, z)
                jetAxis = Vector3(v_jet.Px(), v_jet.Py(), v_jet.Pz())
                dR_ev.append(pv2sv.DeltaR(jetAxis))
            else:
                # defaults for jets without a secondary vertex
                dR_ev.append(-1)
                sv1_Lxy_ev.append(-100)
                sv1_L3d_ev.append(-100)

        sv1_Lxy.append(sv1_Lxy_ev)
        dR.append(dR_ev)
        sv1_L3d.append(sv1_L3d_ev)

    df['jet_sv1_dR'] = dR
    df['jet_sv1_Lxy'] = sv1_Lxy
    df['jet_sv1_L3d'] = sv1_L3d

    # -- add more default values for sv1 variables
    # number of stored vertex entries per jet, reshaped like jet_pt
    sv1_vtx_ok = pup.match_shape(
        np.asarray([len(el) for event in df['jet_sv1_vtx_x'] for el in event]),
        df['jet_pt'])

    for (ok4event, sv1_ntkv4event, sv1_n2t4event, sv1_mass4event,
         sv1_efrc4event,
         sv1_sig34event) in zip(sv1_vtx_ok, df['jet_sv1_ntrkv'],
                                df['jet_sv1_n2t'], df['jet_sv1_m'],
                                df['jet_sv1_efc'], df['jet_sv1_sig3d']):
        no_vtx = np.asarray(ok4event) == 0
        sv1_ntkv4event[no_vtx] = -1
        sv1_n2t4event[no_vtx] = -1
        sv1_mass4event[no_vtx] = -1000
        sv1_efrc4event[no_vtx] = -1
        sv1_sig34event[no_vtx] = -100

    # -- JF features
    jf_dR = []
    for (etas, phis, masses) in zip(df['jet_jf_deta'], df['jet_jf_dphi'],
                                    df['jet_jf_m']):  # loop thru events
        jf_dR.append([
            np.sqrt(eta * eta + phi * phi) if mass > 0 else -10
            for (eta, phi, mass) in zip(etas, phis, masses)  # loop thru jets
        ])
    df['jet_jf_dR'] = jf_dR

    # -- add more default values for jf variables
    for (jf_mass, jf_n2tv, jf_ntrkv, jf_nvtx, jf_nvtx1t, jf_efrc,
         jf_sig3) in zip(df['jet_jf_m'], df['jet_jf_n2t'],
                         df['jet_jf_ntrkAtVx'], df['jet_jf_nvtx'],
                         df['jet_jf_nvtx1t'], df['jet_jf_efc'],
                         df['jet_jf_sig3d']):
        # computed before jf_mass itself is overwritten, so the statement
        # order below cannot affect which jets receive defaults
        no_jf = jf_mass <= 0
        jf_n2tv[no_jf] = -1
        jf_ntrkv[no_jf] = -1
        jf_nvtx[no_jf] = -1
        jf_nvtx1t[no_jf] = -1
        jf_mass[no_jf] = -1e3
        jf_efrc[no_jf] = -1
        jf_sig3[no_jf] = -100

    return df
def transformVars(df):
    '''
    modifies the variables to create the ones that mv2 uses, inserts default values when needed, saves new variables
    in the dataframe. The per-jet arrays stored in the IPxD, IPMP, SV1 and JF input columns are overwritten in place
    when default values are inserted.
    Args:
    -----
        df: pandas dataframe containing all the interesting variables as extracted from the .root file
    Returns:
    --------
        modified mv2-compliant dataframe (the same object that was passed in), with the added columns
        'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu', 'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu',
        'jet_ip', 'jet_ip_c', 'jet_ip_cu', 'jet_sv1_dR', 'jet_sv1_Lxy', 'jet_sv1_L3d' and 'jet_jf_dR'
    '''
    from rootpy.vector import LorentzVector, Vector3
    import pandautils as pup

    def _log_ratio(numerator, denominator):
        # Per-event log of the ratio of two columns. The lambdas are kept so
        # that np.log is called on each event's array individually; the result
        # is then passed through replaceInfNaN with -20 (presumably the
        # substitute for inf/NaN entries -- defined elsewhere in the file).
        return (df[numerator] / df[denominator]).apply(
            lambda x: np.log(x)).apply(lambda x: replaceInfNaN(x, -20))

    # -- modify features and set default values
    df['abs(jet_eta)'] = abs(df['jet_eta'])

    # -- create new IPxD features
    # The mask must be computed ONCE, before pu is overwritten: recomputing it
    # from the already-reset pu yields an all-False mask, so pb and pc would
    # never receive their default.
    # NOTE(review): with pu, pb and pc all reset to -1 the log-ratios below
    # evaluate to log(1) = 0 for those jets, not to the -20 fallback -- confirm
    # this is the intended default.
    for (pu, pb, pc) in zip(df['jet_ip2d_pu'], df['jet_ip2d_pb'], df['jet_ip2d_pc']):
        invalid = np.logical_or(pu >= 10, pu < -1)
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    for (pu, pb, pc) in zip(df['jet_ip3d_pu'], df['jet_ip3d_pb'], df['jet_ip3d_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    df['jet_ip2'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pu')
    df['jet_ip2_c'] = _log_ratio('jet_ip2d_pb', 'jet_ip2d_pc')
    df['jet_ip2_cu'] = _log_ratio('jet_ip2d_pc', 'jet_ip2d_pu')
    df['jet_ip3'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pu')
    df['jet_ip3_c'] = _log_ratio('jet_ip3d_pb', 'jet_ip3d_pc')
    df['jet_ip3_cu'] = _log_ratio('jet_ip3d_pc', 'jet_ip3d_pu')

    # -- create new IPMP features
    for (pu, pb, pc) in zip(df['jet_ipmp_pu'], df['jet_ipmp_pb'], df['jet_ipmp_pc']):
        invalid = pu >= 10
        pu[invalid] = -1
        pb[invalid] = -1
        pc[invalid] = -1
    df['jet_ip'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pu')
    df['jet_ip_c'] = _log_ratio('jet_ipmp_pb', 'jet_ipmp_pc')
    df['jet_ip_cu'] = _log_ratio('jet_ipmp_pc', 'jet_ipmp_pu')

    # -- SV1 features
    # displacement of the secondary vertex with respect to the primary vertex
    dx = df['jet_sv1_vtx_x'] - df['PVx']
    dy = df['jet_sv1_vtx_y'] - df['PVy']
    dz = df['jet_sv1_vtx_z'] - df['PVz']

    v_jet = LorentzVector()
    pv2sv = Vector3()
    sv1_L3d = []
    sv1_Lxy = []
    dR = []

    # Iterate the columns in lockstep instead of indexing them with an
    # enumerate() counter: Series[int] is a label lookup, which is wrong (or
    # raises) whenever the dataframe index is not 0..n-1.
    for (dx_ev, dy_ev, dz_ev, pt_ev, eta_ev, phi_ev, m_ev) in zip(
            dx, dy, dz, df['jet_pt'], df['jet_eta'], df['jet_phi'], df['jet_m']):  # loop thru events
        sv1_L3d_ev = []
        sv1_Lxy_ev = []
        dR_ev = []
        for (dx_jet, dy_jet, dz_jet, pt, eta, phi, mass) in zip(
                dx_ev, dy_ev, dz_ev, pt_ev, eta_ev, phi_ev, m_ev):  # loop thru jets
            if dx_jet.size != 0:
                # use the first stored vertex entry of the jet
                x, y, z = dx_jet[0], dy_jet[0], dz_jet[0]
                sv1_L3d_ev.append(np.sqrt(pow(x, 2) + pow(y, 2) + pow(z, 2)))
                sv1_Lxy_ev.append(math.hypot(x, y))

                # angular distance between the PV->SV direction and the jet axis
                v_jet.SetPtEtaPhiM(pt, eta, phi, mass)
                pv2sv.SetXYZ(x, y, z)
                jetAxis = Vector3(v_jet.Px(), v_jet.Py(), v_jet.Pz())
                dR_ev.append(pv2sv.DeltaR(jetAxis))
            else:
                # defaults for jets without a secondary vertex
                dR_ev.append(-1)
                sv1_Lxy_ev.append(-100)
                sv1_L3d_ev.append(-100)

        sv1_Lxy.append(sv1_Lxy_ev)
        dR.append(dR_ev)
        sv1_L3d.append(sv1_L3d_ev)

    df['jet_sv1_dR'] = dR
    df['jet_sv1_Lxy'] = sv1_Lxy
    df['jet_sv1_L3d'] = sv1_L3d

    # -- add more default values for sv1 variables
    # number of stored vertex entries per jet, reshaped like jet_pt
    sv1_vtx_ok = pup.match_shape(np.asarray([len(el) for event in df['jet_sv1_vtx_x'] for el in event]), df['jet_pt'])

    for (ok4event, sv1_ntkv4event, sv1_n2t4event, sv1_mass4event, sv1_efrc4event, sv1_sig34event) in zip(
            sv1_vtx_ok, df['jet_sv1_ntrkv'], df['jet_sv1_n2t'], df['jet_sv1_m'], df['jet_sv1_efc'], df['jet_sv1_sig3d']):
        no_vtx = np.asarray(ok4event) == 0
        sv1_ntkv4event[no_vtx] = -1
        sv1_n2t4event[no_vtx] = -1
        sv1_mass4event[no_vtx] = -1000
        sv1_efrc4event[no_vtx] = -1
        sv1_sig34event[no_vtx] = -100

    # -- JF features
    jf_dR = []
    for (etas, phis, masses) in zip(df['jet_jf_deta'], df['jet_jf_dphi'], df['jet_jf_m']):  # loop thru events
        jf_dR.append([
            np.sqrt(eta * eta + phi * phi) if mass > 0 else -10
            for (eta, phi, mass) in zip(etas, phis, masses)  # loop thru jets
        ])
    df['jet_jf_dR'] = jf_dR

    # -- add more default values for jf variables
    for (jf_mass, jf_n2tv, jf_ntrkv, jf_nvtx, jf_nvtx1t, jf_efrc, jf_sig3) in zip(
            df['jet_jf_m'], df['jet_jf_n2t'], df['jet_jf_ntrkAtVx'], df['jet_jf_nvtx'],
            df['jet_jf_nvtx1t'], df['jet_jf_efc'], df['jet_jf_sig3d']):
        # computed before jf_mass itself is overwritten, so the statement
        # order below cannot affect which jets receive defaults
        no_jf = jf_mass <= 0
        jf_n2tv[no_jf] = -1
        jf_ntrkv[no_jf] = -1
        jf_nvtx[no_jf] = -1
        jf_nvtx1t[no_jf] = -1
        jf_mass[no_jf] = -1e3
        jf_efrc[no_jf] = -1
        jf_sig3[no_jf] = -100

    return df