def get_RC(self):
    """Fit retention coefficients for this peptide set and store them in ``self.RC``.

    Coefficients are fitted with ``achrom.get_RCs_vary_lcp`` on the modified
    sequences in ``self.peptideslist`` against ``self.RT_exp``.  Labels that
    have a default value in ``achrom.RCs_gilar_rp`` but no fitted value are
    filled in by mapping the default onto the fitted scale with a linear
    regression built from the labels that have both values.

    If anything in the fit fails, the error is logged and
    ``achrom.RCs_gilar_rp`` itself is used as ``self.RC``.
    """
    try:
        seqs = [pept.modified_sequence for pept in self.peptideslist]
        RTexp = self.RT_exp
        RC_def = achrom.RCs_gilar_rp
        # NOTE: setdefault writes into the dict object exposed by achrom,
        # so 'U' and 'O' stay defined there after this call.
        RC_def['aa'].setdefault('U', RC_def['aa'].get('C', 0.0))
        RC_def['aa'].setdefault('O', RC_def['aa'].get('K', 0.0))

        aa_labels = set(RC_def['aa'])
        for pept in self.peptideslist:
            # .values() works on both Python 2 and 3; .itervalues() raised
            # AttributeError on Python 3 and silently forced the fallback.
            aa_labels.update(pept.modification_list.values())

        # label -> [default RC, fitted RC]; None marks a missing value.
        xdict = {}
        for key, val in RC_def['aa'].items():
            xdict[key] = [val, None]

        RC_dict = achrom.get_RCs_vary_lcp(seqs, RTexp, labels=aa_labels)
        for key, val in RC_dict['aa'].items():
            if key in xdict:
                xdict[key][1] = val
            else:
                xdict[key] = [None, val]

        # Regress fitted on default using only labels that have both values.
        paired = [x for x in xdict.values()
                  if x[0] is not None and x[1] is not None]
        a, b, _, _ = aux.linear_regression(
            [x[0] for x in paired], [x[1] for x in paired])

        for key, x in xdict.items():
            if x[1] is None:
                x[1] = x[0] * a + b
            RC_dict['aa'][key] = x[1]

        if 'C' not in RC_dict['aa']:
            RC_dict['aa']['C'] = RC_dict['aa']['C*']
    except Exception:
        # Best-effort fallback is kept, but the traceback is now logged and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        logger.exception('Error in get_RC for achrom model. Using RCs_gilar_rp')
        RC_dict = achrom.RCs_gilar_rp
    self.RC = RC_dict
def get_calibrate_coeff(self):
    """Return ``aux.linear_regression`` of experimental RT on predicted RT.

    For each peptide sequence, an observation is skipped when its
    experimental RT lies within 2 units of one already accepted for the
    same sequence, so near-duplicate measurements do not enter the
    regression more than once.
    """
    accepted_rts = {}  # sequence -> experimental RTs already used
    points = []        # [predicted RT, experimental RT] pairs
    for pept, rt_observed, rt_model in izip(
            self.peptideslist, self.RT_exp, self.RT_predicted):
        known = accepted_rts.setdefault(pept.sequence, [])
        if any(abs(rt_observed - v) < 2 for v in known):
            continue
        known.append(rt_observed)
        points.append([rt_model, rt_observed])

    predicted = [pair[0] for pair in points]
    observed = [pair[1] for pair in points]
    return aux.linear_regression(predicted, observed)
def rt_filtering(results, settings):
    """Return a copy of *settings* with an RT-based ``scoring/condition`` set.

    Retention coefficients are fitted on the top candidate of every entry in
    *results* (``res['candidates'][0][1]``) against the RT obtained with
    ``utils.get_RT(res['spectrum'])``.  Labels that have a default in
    ``achrom.RCs_gilar_rp`` but no fitted value are filled in through a
    linear regression between default and fitted values.  The installed
    condition accepts a candidate when ``spectrum['RT']`` minus its
    calculated RT lies between the 0.05th and 99.95th percentiles of the
    deviations seen on the training data.

    If all experimental RTs are close to zero, a warning is logged and the
    copied settings are returned without a condition.
    """
    settings = settings.copy()
    if settings.has_option('misc', 'legend'):
        legend = settings.get('misc', 'legend')
    else:
        legend = None

    RTexp, seqs = zip(*[(utils.get_RT(res['spectrum']), res['candidates'][0][1])
                        for res in results])

    if legend is not None:
        stdl = set(parser.std_labels)
        newseqs = []
        for s in seqs:
            if parser.fast_valid(s):
                newseqs.append(list(s))
            else:
                seq = []
                # The flags need their own names: the loop variable used to
                # be called `c` too, which clobbered the C-terminal flag so
                # the standard C-terminus was never appended.
                has_cterm, has_nterm = False, False
                for char in s:
                    if char in stdl:
                        seq.append(char)
                    else:
                        mod, res, term = legend[char]
                        if res == '-':
                            if term == '[':
                                seq.append(mod + '-')
                                has_nterm = True
                            else:
                                seq.append('-' + mod)
                                has_cterm = True
                        else:
                            seq.append(mod + res)
                if not has_nterm:
                    seq.append(parser.std_nterm)
                if not has_cterm:
                    seq.append(parser.std_cterm)
                newseqs.append(seq)
        seqs = newseqs

    RTexp = [float(x) for x in RTexp]
    if np.allclose(RTexp, 0):
        logger.warning('RT is missing. Skipping RT optimization.')
        return settings

    RC_def = achrom.RCs_gilar_rp
    # label -> [default RC, fitted RC]; None marks "no fitted value".
    xdict = {}
    for key, val in RC_def['aa'].items():
        xdict[key] = [val, None]
    RC_dict = utils.get_RCs_vary_lcp(seqs, RTexp)
    RC_dict_new = dict()
    for key, val in RC_dict['aa'].items():
        xdict.setdefault(key, [val, None])[1] = val

    fitted = [x for x in xdict.values() if x[1] is not None]
    a, b, _, _ = aux.linear_regression(
        [x[0] for x in fitted],
        [x[1] for x in fitted])
    for key, x in xdict.items():
        if x[1] is None:
            x[1] = x[0] * a + b
        RC_dict_new[key] = x[1]

    if legend is not None:
        for k, v in legend.items():
            if len(k) == 1:
                continue
            if k[-1] in '[]':
                if k[-2] == '-':
                    kk = ('-' + k[1:-1]) if k[-1] == ']' else (k[:-1])
                else:
                    kk = k[:-1]
            elif len(k) > 1:
                kk = k
            logger.debug('%s -> %s', k, kk)
            if kk in RC_dict_new:
                RC_dict_new[v] = RC_dict_new[kk]
            else:
                # Reset per iteration so an unmatched label falls back to 0
                # instead of raising or reusing the previous label's choice.
                kkk = None
                if kk[-1].isupper():
                    kkk = kk[-1]
                elif kk[-1] == '-':
                    kkk = parser.std_nterm
                elif kk[0] == '-':
                    kkk = parser.std_cterm
                RC_dict_new[v] = RC_dict_new.get(kkk, 0)
                logger.info('No RC for %s, using %s or 0: %s',
                            kk, kkk, RC_dict_new[v])

    RC_dict['aa'] = RC_dict_new
    logger.debug('RC dict: %s', RC_dict)

    rtexp = np.array([np.mean(x) for x in RTexp])
    rttheor = np.array(
        [calculate_RT(pep, RC_dict, raise_no_mod=False) for pep in seqs])
    deltaRT = rtexp - rttheor
    logger.debug('Linear regression: %s',
                 aux.linear_regression(rtexp, rttheor))
    best_RT_l = scoreatpercentile(deltaRT, 0.05)
    best_RT_r = scoreatpercentile(deltaRT, 99.95)

    def condition(spectrum, cand, _, stored_value=False):
        # A falsy stored_value means "not computed yet": compute it here and
        # return it alongside the verdict.
        if not stored_value:
            stored_value = calculate_RT(cand, RC_dict)
        rtd = spectrum['RT'] - stored_value
        return best_RT_l <= rtd <= best_RT_r, stored_value

    settings.set('scoring', 'condition', condition)
    return settings
# Histogram of peptide m/z values: 2000 bins over 0-4000, i.e. 2 Th per bin
# (matches the y-axis label).
plt.figure()
plt.hist([peptide['m/z'] for peptide in peptides], bins=2000, range=(0, 4000))
plt.xlabel('m/z, Th')
plt.ylabel('# of peptides within 2 Th bin')

# Histogram of peptide charges: 20 bins over 0-10.
plt.figure()
plt.hist([peptide['charge'] for peptide in peptides], bins=20, range=(0, 10))
plt.xlabel('charge, e')
plt.ylabel('# of peptides')

# 2-D histogram of the two retention-time values stored per peptide
# ('RT_RP' vs 'RT_normal'), plus their linear regression.
x = [peptide['RT_RP'] for peptide in peptides]
y = [peptide['RT_normal'] for peptide in peptides]
heatmap, xbins, ybins = np.histogram2d(x, y, bins=100)
# Empty cells are set to NaN - presumably so imshow leaves them blank.
heatmap[heatmap == 0] = np.nan
a, b, r, stderr = auxiliary.linear_regression(x, y)

# NOTE(review): imshow is called without `extent`, so the tick values are
# bin indices rather than minutes; histogram2d also puts x along the first
# axis, which imshow draws vertically - confirm the axis labels match.
plt.figure()
plt.imshow(heatmap)
plt.xlabel('RT on RP, min')
plt.ylabel('RT on normal phase, min')
plt.title('All tryptic peptides, RT correlation = {0}'.format(r))

# Same analysis for m/z vs 'RT_RP'; x, y, heatmap, the bin edges and the
# regression results are overwritten here.
x = [peptide['m/z'] for peptide in peptides]
y = [peptide['RT_RP'] for peptide in peptides]
heatmap, xbins, ybins = np.histogram2d(x, y, bins=[150, 2000], range=[[0, 4000], [0, 150]])
heatmap[heatmap == 0] = np.nan
a, b, r, stderr = auxiliary.linear_regression(x, y)
def prepare_dataframe(infile_path, decoy_prefix=None, decoy_infix=False, cleavage_rule=False, fdr=0.01, decoy2set=None):
    """Load a search-result file into a DataFrame and add derived PSM columns.

    The reader is chosen by file extension: ``.pep.xml`` / ``.pepxml`` use
    ``pepxml.DataFrame`` and ``.mzid`` uses ``mzid.DataFrame``.  Columns are
    then harmonised for the detected search engine, decoys are labelled, a
    default target-decoy filtering is run on the ``expect`` column, and a
    retention-time model is fitted on the filtered PSMs.

    Parameters
    ----------
    infile_path : path of the input file (extension is matched
        case-insensitively).
    decoy_prefix, decoy_infix : forwarded to ``is_decoy``.
    cleavage_rule : rule passed to ``parser.num_sites``; any falsy value
        selects ``parser.expasy_rules['trypsin']``.
    fdr : forwarded to ``filter_custom``.
    decoy2set : if None it is obtained from ``split_decoys(df1)``; otherwise
        it is used to build the ``decoy2`` and ``decoy1`` columns.

    Returns
    -------
    (df1, decoy2set)

    Raises
    ------
    WrongInputError : the extension is not recognised.
    EmptyFileError : the loaded table has no rows.
    NoDecoyError : no row is flagged as decoy.
    """
    if not cleavage_rule:
        cleavage_rule = parser.expasy_rules['trypsin']
    if infile_path.lower().endswith(
            '.pep.xml') or infile_path.lower().endswith('.pepxml'):
        df1 = pepxml.DataFrame(infile_path)
        ftype = 'pepxml'
    elif infile_path.lower().endswith('.mzid'):
        # NOTE(review): ftype is not assigned on this path; it is only set
        # below if an 'MS-GF:EValue' column exists.  Otherwise the later
        # `ftype == 'pepxml'` test looks like it would hit an unbound local
        # - confirm which mzid producers are expected here.
        df1 = mzid.DataFrame(infile_path)
    else:
        raise WrongInputError()
    if not df1.shape[0]:
        raise EmptyFileError()

    if 'Morpheus Score' in df1.columns:
        # Zero scores are dropped before taking the reciprocal as 'expect'.
        df1 = df1[df1['Morpheus Score'] != 0]
        df1['expect'] = 1 / df1['Morpheus Score']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))

    if 'MS-GF:EValue' in df1.columns:
        # MSGF search engine: copy its columns to the names used below.
        ftype = 'msgf'
        df1['peptide'] = df1['PeptideSequence']
        df1['num_missed_cleavages'] = df1['peptide'].apply(
            lambda x: parser.num_sites(x, rule=cleavage_rule))
        df1['assumed_charge'] = df1['chargeState']
        df1['spectrum'] = df1['spectrumID']
        df1['massdiff'] = (
            df1['experimentalMassToCharge'] - df1['calculatedMassToCharge']) * df1['assumed_charge']
        # 1.00727649 is presumably the proton mass, removed once per charge
        # to turn m/z * z into a neutral mass - TODO confirm.
        df1['calc_neutral_pep_mass'] = df1['calculatedMassToCharge'] * df1[
            'chargeState'] - df1['chargeState'] * 1.00727649
        df1['protein'] = df1['accession']
        df1['protein_descr'] = df1['protein description']
        df1['expect'] = df1['MS-GF:EValue']

    if set(df1['protein_descr'].str[0]) == {None}:
        # MSFragger: every description is empty, so each protein entry is
        # split on the first whitespace run into name and description.
        logger.debug('Adapting MSFragger DataFrame.')
        # NOTE(review): .loc[1, ...] is a label lookup and raises KeyError
        # if no row is labelled 1 - confirm this is safe for small inputs.
        logger.debug('Proteins before: %s', df1.loc[1, 'protein'])
        protein = df1['protein'].apply(
            lambda row: [x.split(None, 1) for x in row])
        df1['protein'] = protein.apply(lambda row: [x[0] for x in row])
        try:
            df1['protein_descr'] = protein.apply(
                lambda row: [x[1] for x in row])
        except IndexError:
            # Some entry had no description part: use empty strings for all.
            df1['protein_descr'] = protein.apply(lambda row: ['' for x in row])
        logger.debug('Proteins after: %s', df1.loc[1, 'protein'])

    # if any(None in set(df1['protein_descr'].str[0])):
    #     print('HERE')
    #     df1['protein_descr'] = df1.apply(lambda x: x['protein_descr'] if x['protein_descr'] else x['protein'], axis=1)
    # Rows with a missing description reuse the protein value instead.
    df1.loc[pd.isna(df1['protein_descr']), 'protein_descr'] = df1.loc[pd.isna(df1['protein_descr']), 'protein']
    # try:
    #     df1['expect'] = 1.0 / df1['bions_score_neg'].values
    # except:
    #     pass

    df1 = df1[~pd.isna(df1['peptide'])]
    if 'MS1Intensity' not in df1:
        df1['MS1Intensity'] = 0.0
    df1['length'] = df1['peptide'].apply(len)
    # Peptides shorter than 6 residues are discarded.
    df1 = df1[df1['length'] >= 6]
    # Keep only the part of the spectrum title before ' RTINS'.
    df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINS')[0])

    # Experimental RT: 'retention_time_sec' is divided by 60, 'scan start
    # time' is taken unchanged, and 0 is used when neither column exists.
    if 'retention_time_sec' not in df1.columns:
        if 'scan start time' in df1.columns:
            df1['RT exp'] = df1['scan start time']
            df1 = df1.drop([
                'scan start time',
            ], axis=1)
        else:
            df1['RT exp'] = 0
    else:
        df1['RT exp'] = df1['retention_time_sec'] / 60
        df1 = df1.drop([
            'retention_time_sec',
        ], axis=1)

    # Integer part of the mass difference; 1.003354 is presumably the
    # isotope spacing used to remove that part before computing the
    # residual error in ppm - TODO confirm.
    df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0)))
    df1['massdiff_ppm'] = 1e6 * (df1['massdiff'] - df1['massdiff_int'] * 1.003354) / df1['calc_neutral_pep_mass']

    df1['decoy'] = df1['protein'].apply(is_decoy, decoy_prefix=decoy_prefix, decoy_infix=decoy_infix)
    if not df1.decoy.sum():
        raise NoDecoyError()
    if decoy2set is None:
        # presumably split_decoys also adds the 'decoy1'/'decoy2' columns
        # that are read below - verify against its definition.
        decoy2set = split_decoys(df1)
    else:
        # A PSM is decoy2 only if all of its proteins are in decoy2set.
        df1['decoy2'] = df1['protein'].apply(
            lambda p: all(x in decoy2set for x in p))
        df1['decoy1'] = df1['decoy'] & (~df1['decoy2'])
    df1 = remove_column_hit_rank(df1)

    if ftype == 'pepxml':
        df1['mods_counter'] = df1.apply(parse_mods, axis=1)
    elif ftype == 'msgf':
        df1['mods_counter'] = df1.apply(parse_mods_msgf, axis=1)
    prepare_mods(df1)

    # Default filtering runs on everything except decoy1, with decoy2 as the
    # decoy label and the decoy2/decoy fraction passed as `ratio`.
    pep_ratio = df1['decoy2'].sum() / df1['decoy'].sum()
    df1_f = filter_custom(df1[~df1['decoy1']], fdr=fdr, key='expect', is_decoy='decoy2', reverse=False, remove_decoy=False, ratio=pep_ratio, formula=1, correction=None, loglabel='PSMs default')
    num_psms_def = df1_f[~df1_f['decoy2']].shape[0]
    # NOTE(review): the message says 1% although `fdr` is a parameter.
    logger.info(
        'Default target-decoy filtering, 1%% PSM FDR: Number of target PSMs = %d',
        num_psms_def)

    try:
        logger.info('Calibrating retention model...')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            retention_coefficients = achrom.get_RCs_vary_lcp(
                df1_f['peptide'].values, df1_f['RT exp'].values)
        df1_f['RT pred'] = df1_f['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, retention_coefficients))
        _, _, r_value, std_value = aux.linear_regression(
            df1_f['RT pred'], df1_f['RT exp'])
        logger.info('RT model training results: R^2 = %f , std = %f',
                    r_value**2, std_value)
        df1['RT diff'] = df1['RT pred'] - df1['RT exp']
        logger.info('Retention model calibrated successfully.')
    except Exception:
        # Any failure above is treated as "no usable RT": predictions fall
        # back to achrom.RCs_krokhin_100A_tfa and 'RT diff' is set to the
        # experimental RT itself.
        logger.warning('Retention times are probably missing in input file.')
        df1['RT pred'] = df1['peptide'].apply(
            lambda x: calc_RT(x, achrom.RCs_krokhin_100A_tfa))
        df1['RT diff'] = df1['RT exp']
    return df1, decoy2set
def test_linear_regression_no_y_arr(self):
    """linear_regression given one NumPy array of (x, y) pairs and no y."""
    points = np.array(list(zip(self.x, self.y)))
    self._test_linreg(aux.linear_regression(points))
def test_linear_regression_no_y_list(self):
    """linear_regression given one list of (x, y) tuples and no y."""
    points = list(zip(self.x, self.y))
    self._test_linreg(aux.linear_regression(points))
def test_linear_regression_simple(self):
    """linear_regression given separate x and y sequences."""
    outcome = aux.linear_regression(self.x, self.y)
    self._test_linreg(outcome)