def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False,
                      cleavage_rule=False, fdr=0.01, decoy2set=None):
    """Read a search-engine result file and normalise it into a PSM table.

    The file type is chosen from the (case-insensitive) file extension:
    ``.pep.xml`` / ``.pepxml`` are read with ``pepxml.DataFrame`` and
    ``.mzid`` with ``mzid.DataFrame``. Engine-specific columns (Morpheus,
    MS-GF+, MSFragger) are then mapped onto a common set of column names and
    derived columns (``length``, ``RT exp``, ``massdiff_int``,
    ``massdiff_ppm``, ``mods_counter``, ``RT pred``, ``RT diff``) are added.

    Args:
        infile_path: Path to the input file.
        decoy_prefix: Not used by this definition.
        decoy_infix: Not used by this definition.
        cleavage_rule: Passed as ``rule`` to ``parser.num_sites``. A falsy
            value selects ``parser.expasy_rules['trypsin']``.
        fdr: Not used by this definition.
        decoy2set: Returned unchanged.

    Returns:
        A ``(df1, decoy2set)`` tuple.

    Raises:
        WrongInputError: The extension is not recognised, or the file is an
            ``.mzid`` without an ``MS-GF:EValue`` column, for which no
            modification parser is available here.
        EmptyFileError: The parsed table has no rows.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    # ftype stays None for an .mzid file until an engine is recognised below.
    ftype = None
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # .copy() so the column assignments below act on an independent frame
        # rather than on the result of a boolean selection.
        df1 = df1[df1['Morpheus Score'] != 0].copy()
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']
        ) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass in Da -- TODO confirm.
        df1['calc_neutral_pep_mass'] = (
            df1['calculatedMassToCharge'] * df1['chargeState']
            - df1['chargeState'] * 1.00727649)
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: index label 1 need not exist (single-row input).
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        # Fall back to '' per entry, so one header without a description does
        # not discard the descriptions of every other protein.
        df1['protein_descr'] = protein.apply(
            lambda row: [x[1] if len(x) > 1 else '' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Where no description is available, reuse the protein identifier.
    missing_descr = pd.isna(df1['protein_descr'])
    df1.loc[missing_descr, 'protein_descr'] = df1.loc[missing_descr, 'protein']

    df1 = df1[~pd.isna(df1['peptide'])].copy()
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6].copy()
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    if 'retention_time_sec' in df1.columns:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop(['retention_time_sec'], axis=1)
    elif 'scan start time' in df1.columns:
        df1['RT exp'] = df1['scan start time']
        df1 = df1.drop(['scan start time'], axis=1)
    else:
        df1['RT exp'] = 0

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # 1.003354 is presumably the C13-C12 mass difference -- TODO confirm.
    df1['massdiff_ppm'] = 1e6 * (
        df1['massdiff'] - df1['massdiff_int'] * 1.003354
    ) / df1['calc_neutral_pep_mass']

    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # Previously an UnboundLocalError on `ftype`; there is no modification
        # parser for this kind of input.
        raise WrongInputError()
    prepare_mods(df1)

    try:
        logger.info('Calibrating retention model...')
        # NOTE(review): retention_coefficients, r_value and std_value are not
        # bound anywhere in this definition. Unless they exist at module
        # level, this body raises NameError and the fallback below is what
        # actually runs -- confirm the intended calibration step.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df1['RT pred'] = df1['peptide'].apply(
                lambda x: calc_RT(x, retention_coefficients))
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        logger.warning('Retention times are probably missing in input file.')
        # Keep the real cause available for debugging instead of dropping it.
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']

    return df1, decoy2set
'^(\s+[^\s]+){2}(\s+(?P<peptide>[A-Z]+))(\s+[^\s]+){10}(\s+(?P<affinity>[0-9]{1,2}\.[0-9]+))' ) results = {} with open(output_location, 'r') as f: for line in f: match = regex.match(line) if match: peptide = match.group('peptide') affinity = float(match.group('affinity')) results[peptide] = affinity else: print('Could not match line: %s' % line) return results mzid_parser = mzid.DataFrame(args.mzid_file) netmhc_alleles = ['H-2-Kb'] """ First, extract the targets and decoys from the PIN file. """ target_peptides = set() decoy_peptides = set() with open(args.pin_file, 'r') as f: reader = csv.DictReader(f, delimiter='\t', restkey='Proteins') next(reader) for row in reader: label = row['Label'].strip() peptide = clean_peptide(row['Peptide']) if len(peptide) >= min_peptide_length and len(
def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False,
                      cleavage_rule=False, fdr=0.01, decoy2set=None):
    """Read a search-engine result file and normalise it into a PSM table.

    The file type is chosen from the (case-insensitive) file extension:
    ``.pep.xml`` / ``.pepxml`` are read with ``pepxml.DataFrame`` and
    ``.mzid`` with ``mzid.DataFrame``. Engine-specific columns (Morpheus,
    MS-GF+, MSFragger) are mapped onto a common set of column names, decoy
    flags are added, a default target-decoy filtering is run with
    ``filter_custom`` and the filtered PSMs are used to fit a retention-time
    model whose predictions are stored in ``RT pred`` / ``RT diff``.

    Args:
        infile_path: Path to the input file.
        decoy_prefix: Forwarded to ``is_decoy``.
        decoy_infix: Forwarded to ``is_decoy``.
        cleavage_rule: Passed as ``rule`` to ``parser.num_sites``. A falsy
            value selects ``parser.expasy_rules['trypsin']``.
        fdr: Passed as ``fdr`` to ``filter_custom``.
        decoy2set: Collection of protein names used to flag ``decoy2`` rows.
            When ``None`` it is obtained from ``split_decoys(df1)``.

    Returns:
        A ``(df1, decoy2set)`` tuple.

    Raises:
        WrongInputError: The extension is not recognised, or the file is an
            ``.mzid`` without an ``MS-GF:EValue`` column, for which no
            modification parser is available here.
        EmptyFileError: The parsed table has no rows.
        NoDecoyError: No row is flagged as a decoy.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']

    # ftype stays None for an .mzid file until an engine is recognised below.
    ftype = None
    lowered_path = infile_path.lower()
    if lowered_path.endswith(('.pep.xml', '.pepxml')):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif lowered_path.endswith('.mzid'):
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # .copy() so the column assignments below act on an independent frame
        # rather than on the result of a boolean selection.
        df1 = df1[df1['Morpheus Score'] != 0].copy()
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']
        ) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass in Da -- TODO confirm.
        df1['calc_neutral_pep_mass'] = (
            df1['calculatedMassToCharge'] * df1['chargeState']
            - df1['chargeState'] * 1.00727649)
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger
        logger.debug('Adapting MSFragger DataFrame.')
        # Positional access: index label 1 need not exist (single-row input).
        logger.debug('Proteins before: %s', df1['protein'].iloc[0])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        # Fall back to '' per entry, so one header without a description does
        # not discard the descriptions of every other protein.
        df1['protein_descr'] = protein.apply(
            lambda row: [x[1] if len(x) > 1 else '' for x in row])
        logger.debug('Proteins after: %s', df1['protein'].iloc[0])

    # Where no description is available, reuse the protein identifier.
    missing_descr = pd.isna(df1['protein_descr'])
    df1.loc[missing_descr, 'protein_descr'] = df1.loc[missing_descr, 'protein']

    df1 = df1[~pd.isna(df1['peptide'])].copy()
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    df1 = df1[df1['length'] >= 6].copy()
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    if 'retention_time_sec' in df1.columns:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop(['retention_time_sec'], axis=1)
    elif 'scan start time' in df1.columns:
        df1['RT exp'] = df1['scan start time']
        df1 = df1.drop(['scan start time'], axis=1)
    else:
        df1['RT exp'] = 0

    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    # 1.003354 is presumably the C13-C12 mass difference -- TODO confirm.
    df1['massdiff_ppm'] = 1e6 * (
        df1['massdiff'] - df1['massdiff_int'] * 1.003354
    ) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(
        is_decoy, decoy_prefix=decoy_prefix, decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        # NOTE(review): split_decoys is expected to add the 'decoy1' and
        # 'decoy2' columns to df1 itself; both are read below -- confirm.
        decoy2set = split_decoys(df1)
    else:
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])
    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    else:
        # Previously an UnboundLocalError on `ftype`; there is no modification
        # parser for this kind of input.
        raise WrongInputError()
    prepare_mods(df1)

    # The decoy count is non-zero here: NoDecoyError was raised otherwise.
    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(
        df1[~df1['decoy1']], fdr=fdr, key='expect', is_decoy='decoy2',
        reverse=False, remove_decoy=False, ratio=pep_ratio, formula=1,
        correction=None, loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    # Report the FDR actually used instead of a hardcoded 1%.
    logger.info(
        'Default target-decoy filtering, %g%% PSM FDR: Number of target PSMs = %d',
        fdr * 100, num_psms_def)

    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        # Predictions for the filtered PSMs are only needed for the regression
        # statistics, so they are kept in a local Series.
        rt_pred_filtered = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            rt_pred_filtered, df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        logger.warning('Retention times are probably missing in input file.')
        # Keep the real cause available for debugging instead of dropping it.
        logger.debug('Retention model calibration failed.', exc_info=True)
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']

    return df1, decoy2set
def read_mzid(path):
    """Build and return a DataFrame from *path* using ``mzid.DataFrame``."""
    frame = mzid.DataFrame(path)
    return frame