def read_pepxml(fname, params_dict):
    '''
    Load a pepXML file into a DataFrame and hand it to ``preprocess_df``.

    Only the columns named in `params_dict` are requested from the reader,
    and schema reading is switched off.

    Parameters
    ----------
    fname : str
        Path to pepxml file
    params_dict : dict
        Dict with all input parameters. The keys 'peptides_column',
        'proteins_column', 'spectrum_column', 'mass_shifts_column' and
        'charge_column' are looked up here; the whole dict is also
        forwarded to ``preprocess_df``.

    Returns
    -------
    DataFrame
        The result of ``preprocess_df(df, fname, params_dict)``.
    '''
    logger.debug('Reading %s', fname)

    # Keys in params_dict whose values name the columns to load.
    column_keys = (
        'peptides_column',
        'proteins_column',
        'spectrum_column',
        'mass_shifts_column',
        'charge_column',
    )
    wanted_columns = tuple(params_dict[key] for key in column_keys)

    raw_frame = pepxml.DataFrame(fname, read_schema=False, columns=wanted_columns)
    return preprocess_df(raw_frame, fname, params_dict)
def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False,
                      cleavage_rule=False, fdr=0.01, decoy2set=None):
    """
    Read a search-engine result file and derive the columns used downstream.

    The reader is chosen by file extension: ``.pep.xml`` / ``.pepxml`` go to
    ``pepxml.DataFrame``, ``.mzid`` goes to ``mzid.DataFrame``. The frame is
    then normalised (Morpheus and MS-GF+ specific columns, MSFragger protein
    names), short peptides are dropped, and retention time, mass difference
    and modification columns are added.

    Parameters
    ----------
    infile_path : str
        Path to the input file; the extension check is case-insensitive.
    decoy_prefix, decoy_infix, fdr
        Accepted for signature compatibility; not used in this function.
    cleavage_rule : str or False
        Rule passed to ``parser.num_sites``. A falsy value selects
        ``parser.expasy_rules['trypsin']``.
    decoy2set
        Returned unchanged as the second element of the result.

    Returns
    -------
    tuple
        ``(df1, decoy2set)`` where ``df1`` is the prepared DataFrame.

    Raises
    ------
    WrongInputError
        If the extension is not supported, or if no modification parser is
        known for the file (an ``.mzid`` file without an 'MS-GF:EValue'
        column).
    EmptyFileError
        If the file yields no rows.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    # ftype selects the modification parser further down. It stays None for
    # an mzid file until the MS-GF+ columns are recognised.
    ftype = None
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # Zero scores are removed first because the score is inverted below.
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: copy its columns to the names used below.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']
        ) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass (m/z -> neutral mass).
        df1['calc_neutral_pep_mass'] = (
            df1['calculatedMassToCharge'] * df1['chargeState']
            - df1['chargeState'] * 1.00727649)
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each protein entry is split on the first whitespace
        # into a name and a description.
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: label 1 is not guaranteed to exist in the index.
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        try:
            df1['protein_descr'] = protein.apply(
                lambda row: [x[1] for x in row])
        except IndexError:
            # At least one entry had no description part; use empty
            # descriptions for every entry.
            df1['protein_descr'] = protein.apply(lambda row: ['' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Rows without a description fall back to the protein value.
    no_descr = pd.isna(df1['protein_descr'])
    df1.loc[no_descr, 'protein_descr'] = df1.loc[no_descr, 'protein']

    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop(['scan start time'], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        # Column name indicates seconds; stored divided by 60.
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop(['retention_time_sec'], axis=1)

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # 1.003354 is presumably the 13C-12C mass difference, so the ppm error
    # is taken after removing whole isotope shifts - TODO confirm.
    df1['massdiff_ppm'] = 1e6 * (
        df1['massdiff'] - df1['massdiff_int'] * 1.003354
    ) / df1['calc_neutral_pep_mass']

    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # An mzid file that is not MS-GF+ output has no modification parser.
        raise WrongInputError()
    prepare_mods(df1)

    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # NOTE(review): retention_coefficients, r_value and std_value are
            # not assigned anywhere in this function. Unless they exist as
            # module-level names, the NameError raised here is caught below
            # and the fallback model is always used - confirm.
            df1['RT pred'] = df1['peptide'].apply(
                lambda x: calc_RT(x, retention_coefficients))
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Best-effort: any failure falls back to a fixed coefficient set.
        logger.warning('Retention times are probably missing in input file.')
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']

    return df1, decoy2set
def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False,
                      cleavage_rule=False, fdr=0.01, decoy2set=None):
    """
    Read a search-engine result file, label decoys and calibrate an RT model.

    The reader is chosen by file extension: ``.pep.xml`` / ``.pepxml`` go to
    ``pepxml.DataFrame``, ``.mzid`` goes to ``mzid.DataFrame``. The frame is
    normalised (Morpheus and MS-GF+ specific columns, MSFragger protein
    names), short peptides are dropped, decoy columns are added, and a
    retention-time model is trained on PSMs that pass a default target-decoy
    filter.

    Parameters
    ----------
    infile_path : str
        Path to the input file; the extension check is case-insensitive.
    decoy_prefix, decoy_infix
        Forwarded to ``is_decoy`` when building the 'decoy' column.
    cleavage_rule : str or False
        Rule passed to ``parser.num_sites``. A falsy value selects
        ``parser.expasy_rules['trypsin']``.
    fdr : float
        FDR level forwarded to ``filter_custom``.
    decoy2set
        Collection of decoy proteins used to fill the 'decoy2' column.
        If None, it is obtained from ``split_decoys(df1)``.

    Returns
    -------
    tuple
        ``(df1, decoy2set)``: the prepared DataFrame and the decoy set that
        was supplied or computed.

    Raises
    ------
    WrongInputError
        If the extension is not supported, or if no modification parser is
        known for the file (an ``.mzid`` file without an 'MS-GF:EValue'
        column).
    EmptyFileError
        If the file yields no rows.
    NoDecoyError
        If no row is flagged as decoy.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    # ftype selects the modification parser further down. It stays None for
    # an mzid file until the MS-GF+ columns are recognised.
    ftype = None
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # Zero scores are removed first because the score is inverted below.
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: copy its columns to the names used below.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']
        ) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass (m/z -> neutral mass).
        df1['calc_neutral_pep_mass'] = (
            df1['calculatedMassToCharge'] * df1['chargeState']
            - df1['chargeState'] * 1.00727649)
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: each protein entry is split on the first whitespace
        # into a name and a description.
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: label 1 is not guaranteed to exist in the index.
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        try:
            df1['protein_descr'] = protein.apply(
                lambda row: [x[1] for x in row])
        except IndexError:
            # At least one entry had no description part; use empty
            # descriptions for every entry.
            df1['protein_descr'] = protein.apply(lambda row: ['' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Rows without a description fall back to the protein value.
    no_descr = pd.isna(df1['protein_descr'])
    df1.loc[no_descr, 'protein_descr'] = df1.loc[no_descr, 'protein']

    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6]
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop(['scan start time'], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        # Column name indicates seconds; stored divided by 60.
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop(['retention_time_sec'], axis=1)

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # 1.003354 is presumably the 13C-12C mass difference, so the ppm error
    # is taken after removing whole isotope shifts - TODO confirm.
    df1['massdiff_ppm'] = 1e6 * (
        df1['massdiff'] - df1['massdiff_int'] * 1.003354
    ) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(
        is_decoy, decoy_prefix=decoy_prefix, decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        # NOTE(review): 'decoy2' is read below, so split_decoys presumably
        # adds that column to df1 in place - confirm.
        decoy2set = split_decoys(df1)
    else:
        # A PSM counts as decoy2 only if all of its proteins are in the set.
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
    df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])

    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # An mzid file that is not MS-GF+ output has no modification parser.
        raise WrongInputError()
    prepare_mods(df1)

    # Share of decoy2 among all decoys, passed to the filter as `ratio`.
    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(
        df1[~df1['decoy1']], fdr=fdr, key='expect', is_decoy='decoy2',
        reverse=False, remove_decoy=False, ratio=pep_ratio, formula=1,
        correction=None, loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    logger.info(
        'Default target-decoy filtering, 1%% PSM FDR: Number of target PSMs = %d',
        num_psms_def)

    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Coefficients are trained on the filtered PSMs, then applied to
            # both the filtered and the full frame.
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
            df1_f['RT pred'] = df1_f['peptide'].apply(
                lambda x: calc_RT(x, retention_coefficients))
            df1['RT pred'] = df1['peptide'].apply(
                lambda x: calc_RT(x, retention_coefficients))
            _, _, r_value, std_value = aux.linear_regression(
                df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Best-effort: any failure falls back to a fixed coefficient set.
        logger.warning('Retention times are probably missing in input file.')
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']

    return df1, decoy2set
def read_pepxml(fname, params_dict):
    """
    Load a pepXML file into a DataFrame with schema reading switched off.

    Parameters
    ----------
    fname : str
        Path to the pepXML file.
    params_dict : dict
        Accepted for signature compatibility; not used here.

    Returns
    -------
    DataFrame
        The result of ``pepxml.DataFrame(fname, read_schema=False)``.
    """
    frame = pepxml.DataFrame(fname, read_schema=False)
    return frame
# converter of protxml files to csv INCLUDING details from StPeter from pyteomics import protxml, pepxml from math import nan import pandas as pd pp = protxml.read("StPeterOut.prot.xml") pepp = pepxml.read("Sample.pep.xml") pdata = protxml.DataFrame(pp) pepdata = pepxml.DataFrame(pepp) pepdata["Quantification_SI"] = nan def extract_stpeter(analysis): if isinstance(analysis, list): a = analysis[0] if (len(a) == 2): b = a.get("StPeterQuant") SI = b.get("SI") SIn = b.get("SIn") peps = b.get("StPeterQuant_peptide") peptable = pd.DataFrame(peps) pepseqs = peptable.get("sequence").to_string() # adding SI values to peptide table for index, r in peptable.iterrows(): pepdata.at[pepdata["modified_peptide"] == r.get("sequence"), "Quantification_SI"] = r.get("SI") pepSI = peptable.get("SI").to_string() return ({ "analysis": a.get("analysis"), "SI": SI,