def create_psm_lookup(fn, header, proteins, pgdb, shiftrows, unroll,
                      specfncol, fastadelim, genefield):
    """Reads PSMs from file, stores them to a database backend in chunked
    PSMs.

    The PSM table is read twice: a first pass collects the peptide sequences
    so that any sequence missing from the backend can be stored, a second
    pass stores the PSMs themselves in chunks of DB_STORE_CHUNK. Afterwards
    the PSM table is indexed and the PSM-protein relations are stored via
    store_psm_protein_relations.

    Arguments:
    fn -- path of the PSM table
    header -- header fields of the PSM table
    proteins -- passed on unchanged to store_psm_protein_relations
    pgdb -- database backend object
    shiftrows -- offset added to the 0-based row number of each PSM
    unroll -- passed to the tsvreader PSM parsing helpers
    specfncol -- header field holding the spectra file name
    fastadelim, genefield -- not used in this function, kept for interface
        compatibility
    """
    mzmlmap = pgdb.get_mzmlfile_map()
    # Dict used as an insertion-ordered set of peptide sequences
    sequences = {}
    for psm in tsvreader.generate_split_tsv_lines(fn, header):
        sequences[tsvreader.get_psm_sequence(psm, unroll)] = 1
    pepseqmap = pgdb.get_peptide_seq_map()
    pgdb.store_pepseqs(((seq, ) for seq in sequences if seq not in pepseqmap))
    # Re-read the map so the newly stored sequences can be looked up
    pepseqmap = pgdb.get_peptide_seq_map()
    psms = []
    for row, psm in enumerate(tsvreader.generate_split_tsv_lines(fn, header)):
        specfn, psm_id, specscanid, seq, score = tsvreader.get_psm(
            psm, unroll, specfncol)
        mzml_id = mzmlmap[specfn]
        psms.append({
            'rownr': row + shiftrows,
            'psm_id': psm_id,
            'seq': pepseqmap[seq],
            'score': score,
            'specfn': mzml_id,
            'spec_id': '{}_{}'.format(mzml_id, specscanid),
        })
        # Flush only full chunks. Checking after the append avoids the
        # empty store call the modulo test caused on the very first row.
        if len(psms) >= DB_STORE_CHUNK:
            pgdb.store_psms(psms)
            psms = []
    if psms:
        # Remainder that did not fill a whole chunk
        pgdb.store_psms(psms)
    pgdb.index_psms()
    store_psm_protein_relations(fn, header, pgdb, proteins, specfncol)
def get_quant(self, theader, features):
    """Add precursor and/or isobaric quant data to features.

    When self.precursor is set, the area header field is appended to
    self.header and features is passed through the top-3 MS1 quant helper.
    When self.quantcolpattern is set, the isobaric quant columns are looked
    up in the header of self.psmfile, self.header is extended with them and
    features is passed through isosummarize.get_isobaric_ratios.

    Prints a message and exits with status 1 when isobaric quant is asked
    for without denominators, median sweep or median intensity, or when
    median intensity is combined with median normalization and no
    denominators are given.

    Returns the (possibly wrapped) features.
    """
    if self.precursor:
        target_peps = tsvreader.generate_split_tsv_lines(self.fn, theader)
        self.header.append(prottabledata.HEADER_AREA)
        features = proteins.add_ms1_quant_from_top3_mzidtsv(
            features, target_peps, self.headeraccfield, self.fixedfeatcol)
    if not self.quantcolpattern:
        return features

    psmheader = tsvreader.get_tsv_header(self.psmfile)
    denomcols = False
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, psmheader)
                     for colnr in self.denomcols]
    elif self.denompatterns is not None:
        denomcols = {
            field
            for pattern in self.denompatterns
            for field in tsvreader.get_columns_by_pattern(psmheader, pattern)
        }
    elif not (self.mediansweep or self.medianintensity):
        print(
            'Must define either denominator column numbers '
            'or regex pattterns to find them, or use median sweep, or '
            'report median intensities.')
        sys.exit(1)
    elif self.medianintensity and self.mediannormalize:
        print(
            'Cannot do median-centering on intensity values, exiting')
        sys.exit(1)

    quantcols = tsvreader.get_columns_by_pattern(
        psmheader, self.quantcolpattern)
    if self.mednorm_factors:
        factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
        mn_factors = tsvreader.generate_split_tsv_lines(
            self.mednorm_factors, factor_header)
    else:
        mn_factors = False
    psm_count_fields = [isosummarize.get_no_psms_field(field)
                        for field in quantcols]
    fullq_field = [prottabledata.HEADER_NO_FULLQ_PSMS]
    self.header = self.header + quantcols + psm_count_fields + fullq_field
    return isosummarize.get_isobaric_ratios(
        self.psmfile, psmheader, quantcols, denomcols, self.mediansweep,
        self.medianintensity, self.median_or_avg, self.minint, features,
        self.headeraccfield, self.fixedfeatcol, False, False, False,
        self.logisoquant, self.mediannormalize, mn_factors, self.keepnapsms)
def set_features(self):
    """Prepare self.header and self.psms for isobaric quant summarizing.

    Denominator columns are taken from self.denomcols (column numbers) or
    self.denompatterns (patterns matched against self.oldheader). Without
    either, median sweep or median intensity has to be set, otherwise a
    RuntimeError is raised.

    With self.featcol set, the output header consists of the feature
    column, the quant columns, their PSM count fields and the full-quant
    PSM count field. Without it, the old header is extended with one
    'ratio_<channel>' field per quant column.
    """
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    elif self.denompatterns is not None:
        denomcols = {
            field
            for pattern in self.denompatterns
            for field in tsvreader.get_columns_by_pattern(self.oldheader,
                                                          pattern)
        }
    elif self.mediansweep or self.medianintensity:
        denomcols = False
    else:
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')

    quantcols = tsvreader.get_columns_by_pattern(self.oldheader,
                                                 self.quantcolpattern)
    if self.mednorm_factors:
        factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
        mn_factors = tsvreader.generate_split_tsv_lines(
            self.mednorm_factors, factor_header)
    else:
        mn_factors = False
    psm_count_fields = [isosummarize.get_no_psms_field(field)
                        for field in quantcols]

    if self.featcol:
        self.get_column_header_for_number(['featcol'], self.oldheader)
        self.header = ([self.featcol] + quantcols + psm_count_fields
                       + [HEADER_NO_FULLQ_PSMS])
    else:
        ratio_fields = ['ratio_{}'.format(field) for field in quantcols]
        self.header = self.oldheader + ratio_fields

    self.psms = isosummarize.get_isobaric_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.mediansweep,
        self.medianintensity, self.median_or_avg, self.minint, False, False,
        self.featcol, False, False, False, self.logisoquant,
        self.mediannormalize, mn_factors, self.keepnapsms)
def generate_peptides(tsvfn, oldheader, switch_map, scorecol, precurquantcol,
                      fncol=None, higherbetter=True):
    """Yield one output line per peptide found in a PSM table.

    Every PSM first gets the fields named in switch_map renamed (old name
    to new name, fields not present are skipped). The PSM is then handed to
    evaluate_peptide and add_quant_values, which maintain the per-peptide
    collection. After all PSMs are read, each peptide line gets its linked
    PSMs joined by '; ' and its parsed quant data, and is yielded.

    fncol defaults to the spectra file header field of the PSM table.
    """
    if fncol is None:
        fncol = mzidtsvdata.HEADER_SPECFILE
    peptides = {}
    for psm in reader.generate_split_tsv_lines(tsvfn, oldheader):
        for oldkey, newkey in switch_map.items():
            if oldkey in psm:
                psm[newkey] = psm.pop(oldkey)
        sequence = psm[peptabledata.HEADER_PEPTIDE]
        peptides = evaluate_peptide(peptides, psm, sequence, higherbetter,
                                    scorecol, fncol)
        add_quant_values(peptides, psm, precurquantcol)
    for peptide in peptides.values():
        linked_psms = '; '.join(peptide['psms'])
        outline = peptide['line']
        outline[peptabledata.HEADER_LINKED_PSMS] = linked_psms
        for qtype, pepquant in peptide['quant'].items():
            outline.update(parse_quant_data(qtype, pepquant))
        yield outline
def store_psm_protein_relations(fn, header, pgdb, proteins, specfncol):
    """Reads PSMs from file, extracts their proteins and peptides
    and passes them to a database backend in chunks.

    A chunk is only stored when the PSM ID changes, so that consecutive
    rows sharing a PSM ID (unrolled PSMs) end up in the same chunk.
    Returns the mapping of PSM ID to its list of proteins.
    """
    # TODO do we need an OrderedDict or is regular dict enough?
    # Sorting for psm_id useful?
    allpsms = OrderedDict()
    previous_id = None
    pending_ids = set()
    flush_on_id_change = False
    for psm in tsvreader.generate_split_tsv_lines(fn, header):
        psm_id, prots = tsvreader.get_pepproteins(psm, specfncol)
        # TODO can this be removed permanently?
        # Filter proteins to only include those that match the protein
        # accessions in fasta so we get the correct names, filter out the
        # badly annotated peptides
        # prots = [x for x in prots if x in proteins]
        if psm_id in allpsms:
            # In case the PSMs are presented unrolled
            allpsms[psm_id].extend(prots)
        else:
            allpsms[psm_id] = prots
        if len(pending_ids) > DB_STORE_CHUNK:
            flush_on_id_change = True
        if flush_on_id_change and previous_id != psm_id:
            pgdb.store_peptides_proteins(allpsms, pending_ids)
            flush_on_id_change = False
            pending_ids = set()
        pending_ids.add(psm_id)
        previous_id = psm_id
    if pending_ids:
        pgdb.store_peptides_proteins(allpsms, pending_ids)
    pgdb.index_protein_peptides()
    return allpsms
def paste_to_psmtable(psmfn, header, ratios):
    """Yield the PSMs of a PSM table with their ratios pasted on.

    PSM table lines and ratios are paired in order. The feature accession
    entry is removed from each ratio dict (the passed dict is modified),
    every remaining entry is added to the PSM under 'ratio_<channel>'.
    """
    # loop psms in psmtable, paste the outratios in memory
    psm_lines = reader.generate_split_tsv_lines(psmfn, header)
    for psm, ratio in zip(psm_lines, ratios):
        ratio.pop(ISOQUANTRATIO_FEAT_ACC)
        for channel, value in ratio.items():
            psm['ratio_{}'.format(channel)] = value
        yield psm
def prepare(self):
    """Read the header of the (first) input file and set up the PSM iterator.

    self.fn may be a single file or a list of files. In the latter case the
    header is read from the first file. Sets self.first_infile,
    self.oldheader and self.oldpsms.
    """
    # isinstance instead of an exact type comparison, so that list
    # subclasses are treated as lists of files as well
    if isinstance(self.fn, list):
        self.first_infile = self.fn[0]
    else:
        self.first_infile = self.fn
    self.oldheader = tsvreader.get_tsv_header(self.first_infile)
    # The line generator gets self.fn as is (single file or list)
    self.oldpsms = tsvreader.generate_split_tsv_lines(self.fn, self.oldheader)
def get_td_proteins_bestpep(self, theader, dheader):
    """Set the output header and create best-peptide proteins for target
    and decoy.

    The score column is looked up by self.scorecolpattern in the target
    and the decoy header separately. Target lines are read from self.fn,
    decoy lines from self.decoyfn.

    Returns a tuple (targets, decoys) as produced by
    proteins.generate_bestpep_proteins.
    """
    self.header = [self.headeraccfield] + prottabledata.PICKED_HEADER
    pattern = self.scorecolpattern
    target_scorecol = tsvreader.get_cols_in_file(pattern, theader, True)
    decoy_scorecol = tsvreader.get_cols_in_file(pattern, dheader, True)
    target_peps = tsvreader.generate_split_tsv_lines(self.fn, theader)
    decoy_peps = tsvreader.generate_split_tsv_lines(self.decoyfn, dheader)
    # Arguments shared by the target and the decoy call
    shared_args = (self.minlogscore, self.headeraccfield, self.fixedfeatcol)
    targets = proteins.generate_bestpep_proteins(
        target_peps, target_scorecol, *shared_args)
    decoys = proteins.generate_bestpep_proteins(
        decoy_peps, decoy_scorecol, *shared_args)
    return targets, decoys
def store_proteins_descriptions(pgdb, fastafn, fastamd5, tsvfn, header,
                                fastadelim, genefield):
    """Store proteins in the database backend, return their accessions.

    With a FASTA file the proteins and their annotation are read from it
    and stored with pgdb.store_fasta. Without one, the proteins are
    collected from the PSM table and stored with pgdb.store_proteins.

    Returns a set of the first element of each protein entry.
    """
    if fastafn:
        prots, seqs, desc, evids, ensgs, symbols = (
            fastareader.get_proteins_for_db(fastafn, fastadelim, genefield))
        pgdb.store_fasta(fastafn, fastamd5, prots, evids, seqs, desc, ensgs,
                         symbols)
    else:
        # Dict used as an insertion-ordered set of accessions
        seen = {}
        for psm in tsvreader.generate_split_tsv_lines(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        prots = [(accession, ) for accession in seen]
        pgdb.store_proteins(prots)
    return {entry[0] for entry in prots}
def get_psmratios(psmfn, header, channels, denom_channels, sweep,
                  report_intensity, summarize_by, min_int, acc_col,
                  logintensities, keep_na_psms):
    """Calculate isobaric ratios or intensities for the PSMs in a table.

    Without acc_col a list with one dict per PSM is returned, mapping each
    channel to its value as a string ('NA' stays 'NA'), with the feature
    accession entry set to False.

    With acc_col the PSM values are grouped by the value of that column and
    one dict per feature is returned, in order of first appearance. It
    holds the feature accession, the summarized channels (when summarize_by
    is 'median' or 'average') and the PSM counts from get_no_psms.
    """
    allfeats, feat_order, psmratios = {}, OrderedDict(), []
    no_value = {'NA', None, False, ''}
    for psm in reader.generate_split_tsv_lines(psmfn, header):
        # remove uninformative psms when adding to features
        # TODO the check for is-not-a-peptide can be removed but there are
        # some usecases for which it is convenient, when adding information
        # to the peptide sequence (e.g. PTM data). When having fully
        # functional PTM data storage/analysis in msstitch, we can possibly
        # remove it
        if acc_col:
            accession = psm[acc_col]
            if accession == '':
                continue
            if acc_col != psmh.HEADER_PEPTIDE and ';' in accession:
                continue
            if {psm[q] for q in channels} <= no_value:
                # Not a single channel carries a value
                continue
        ratios = calc_psm_ratios_or_int(psm, channels, denom_channels, sweep,
                                        report_intensity, min_int,
                                        logintensities)
        if not acc_col:
            psmquant = {}
            for ix, channel in enumerate(channels):
                value = ratios[ix]
                psmquant[channel] = str(value) if value != 'NA' else 'NA'
            psmquant[ISOQUANTRATIO_FEAT_ACC] = False
            psmratios.append(psmquant)
            continue
        if not keep_na_psms and any(
                ratios[ix] == 'NA' for ix, _ in enumerate(channels)):
            continue
        feature = psm[acc_col]
        if feature in allfeats:
            allfeats[feature].append(ratios)
        else:
            allfeats[feature] = [ratios]
            feat_order[feature] = 1
    if not acc_col:
        return psmratios
    outfeatures = []
    for feature in feat_order:
        quants = allfeats[feature]
        outfeature = {ISOQUANTRATIO_FEAT_ACC: feature}
        if summarize_by == 'median':
            outfeature.update(get_medians(channels, quants))
        elif summarize_by == 'average':
            outfeature.update(summarize_by_averages(channels, quants))
        outfeature.update(get_no_psms(channels, quants))
        outfeatures.append(outfeature)
    return outfeatures
def write(self):
    """Write one percolator-annotated PSM table per input PSM table.

    Input PSM tables (self.fn) and mzid files (self.mzidfns) are paired in
    order. The PSMs get FDR data added from the mzid results and
    self.percopsms, are optionally filtered on the PSM and peptide q-value
    fields using self.filtpsm and self.filtpep, and are written to the
    path given by self.create_outfilepath.
    """
    for psmfn, mzidfn in zip(self.fn, self.mzidfns):
        oldheader = tsvreader.get_tsv_header(psmfn)
        header = perco.get_header_with_percolator(oldheader)
        outfn = self.create_outfilepath(psmfn, self.outsuffix)
        namespace = mzidreader.get_mzid_namespace(mzidfn)
        spec_results = mzidreader.mzid_spec_result_generator(mzidfn,
                                                             namespace)
        psms = tsvreader.generate_split_tsv_lines(psmfn, oldheader)
        outpsms = perco.add_fdr_to_mzidtsv(psms, spec_results, namespace,
                                           self.percopsms)
        # PSM level filter first, then peptide level
        filters = (
            (self.filtpsm, psmhead.HEADER_PSMQ),
            (self.filtpep, psmhead.HEADER_PEPTIDE_Q),
        )
        for threshold, qfield in filters:
            if threshold:
                outpsms = filtering.filter_psms_conf(outpsms, qfield,
                                                     threshold, True)
        writer.write_tsv(header, outpsms, outfn)
def set_features(self):
    """Creates iterator to write to new tsv. Contains input tsv lines
    plus quant data for these.

    The PSM table is first stored to the lookup database, after which the
    PSMs are passed through a chain of generators that add the requested
    data (genes, quant, spectra data, protein groups). Sets self.header
    and self.psms.

    Prints a message and exits with status 1 when protein grouping is
    requested but neither a FASTA file was passed nor a FASTA checksum is
    available from a previously refined PSM table.
    """
    # First prepare the data, read PSM table to SQLite
    specfncolnr = int(self.spectracol) - 1
    specfncol = self.oldheader[specfncolnr]
    fastadelim, genefield = self.get_fastadelim_genefield(self.fastadelim,
                                                          self.genefield)
    if self.fasta:
        fasta_md5 = refine.get_fasta_md5(self.fasta)
    else:
        fasta_md5 = False
    # Has to be bound also when there is no old PSM table, since the
    # protein group check below reads it in that case too. Without this
    # default that check raised UnboundLocalError instead of reporting
    # the missing FASTA file.
    oldfasta_md5 = False
    # If appending to previously refined PSM table, reuse DB and shift rows
    if self.oldpsmfile:
        oldfasta_md5 = self.lookup.get_fasta_md5()
        if fasta_md5 != oldfasta_md5:
            print('WARNING, FASTA database used in old PSM table differs '
                  'from the passed database (or this cannot be determined '
                  'due to version differences), this may cause problems, as '
                  'msstitch will use the old database for PSM annotation.')
        shiftrows = self.lookup.get_highest_rownr() + 1
        proteins = set(self.lookup.get_protids())
        self.lookup.drop_psm_indices()
    else:
        shiftrows = 0

    if self.proteingroup:
        if not fasta_md5 and not oldfasta_md5:
            # In case of old Fasta already stored it will be fine to
            # protein group
            print('Cannot create protein group without supplying FASTA search '
                  'database file')
            sys.exit(1)
        self.tabletypes.append('proteingroup')
        self.lookup.drop_pgroup_tables()
    self.lookup.add_tables(self.tabletypes)

    # Need to place this here since we cannot store before having done add
    # tables, but that has to be done after getting proteingroup knowledge,
    # which depends on knowledge of having passed an oldpsmfile (because of
    # oldfasta_md5):
    if not self.oldpsmfile:
        proteins = refine.store_proteins_descriptions(
            self.lookup, self.fasta, fasta_md5, self.fn, self.oldheader,
            fastadelim, genefield)

    refine.create_psm_lookup(self.fn, self.oldheader, proteins, self.lookup,
                             shiftrows, self.unroll, specfncol, fastadelim,
                             genefield)
    if self.isobaric:
        isob_header = [x[0] for x in self.lookup.get_all_quantmaps()]
    else:
        isob_header = False
    self.header = refine.create_header(
        self.oldheader, self.genes, self.proteingroup, self.precursor,
        isob_header, self.addbioset, self.addmiscleav, specfncolnr)
    psms = self.oldpsms

    # Now pass PSMs through multiple generators to add info
    if self.genes:
        psms = refine.add_genes_to_psm_table(psms, self.lookup)
    if self.isobaric or self.precursor:
        psms = refine.generate_psms_quanted(
            self.lookup, shiftrows, psms, isob_header, self.isobaric,
            self.precursor, self.min_purity)
    psms = refine.generate_psms_spectradata(self.lookup, shiftrows, psms,
                                            self.addbioset, self.addmiscleav)

    if self.oldpsmfile:
        prevheader = tsvreader.get_tsv_header(self.oldpsmfile)
        previouspsms = tsvreader.generate_split_tsv_lines(self.oldpsmfile,
                                                          prevheader)
        psms = chain(previouspsms, psms)

    # Enforce proteingroup last, since it has to come AFTER the chaining of
    # old + new PSMs. In theory you could do it before, but that makes no
    # sense since in a big experiment you also do not map PSMs to genes
    # differently? If that is needed, you have to run multiple experiments.
    if self.proteingroup:
        refine.build_proteingroup_db(self.lookup)
        psms = refine.generate_psms_with_proteingroups(
            psms, self.lookup, specfncol, self.unroll)
    self.psms = psms
def set_features(self):
    """Prepare self.header and self.features for a peptide table made from
    a PSM table.

    The peptide header is derived from the PSM header: the spectra file
    field and the isobaric quant fields are dropped, the precursor quant
    field (if any) is replaced by the area field, the peptide and linked
    PSMs fields are put first and PSM field names are switched to their
    peptide table counterparts.

    Isobaric quant is only summarized when a quant column pattern is given
    together with denominators, median sweep or median intensity. When a
    total proteome table is passed, its first header field decides which
    peptide table field is paired with it. If no match is found a message
    is printed and the program exits with status 1.

    With self.modelqvals set, a modeled q-value field is inserted directly
    after the q-value field and the peptides are passed through the linear
    model recalculation.
    """
    qpat = self.quantcolpattern or '[a-z]+[0-9]+plex_'
    header = [field for field in self.oldheader
              if field != psmh.HEADER_SPECFILE]
    try:
        isocols = tsvreader.get_columns_by_pattern(header, qpat)
    except RuntimeError:
        pass
    else:
        for col in isocols:
            header.remove(col)
    if self.precurquantcol:
        header = [peph.HEADER_AREA if field == self.precurquantcol else field
                  for field in header]
    remaining = [field for field in header if field != psmh.HEADER_PEPTIDE]
    header = [peph.HEADER_PEPTIDE, peph.HEADER_LINKED_PSMS] + remaining
    switch_map = {
        psmh.HEADER_PEPTIDE: peph.HEADER_PEPTIDE,
        psmh.HEADER_PROTEIN: peph.HEADER_PROTEINS,
        psmh.HEADER_PEPTIDE_Q: peph.HEADER_QVAL,
    }
    self.header = [switch_map.get(field, field) for field in header]
    peptides = psmtopeptable.generate_peptides(
        self.fn, self.oldheader, switch_map, self.scorecol,
        self.precurquantcol, self.spectracol)

    # Remove quant data if not specified any way to summarize
    summarize_options = (self.denomcols, self.denompatterns,
                         self.mediansweep, self.medianintensity)
    if self.quantcolpattern and any(summarize_options):
        if self.denomcols is not None:
            denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                         for colnr in self.denomcols]
        elif self.denompatterns is not None:
            denomcols = {
                field
                for pattern in self.denompatterns
                for field in tsvreader.get_columns_by_pattern(self.oldheader,
                                                              pattern)
            }
        else:
            denomcols = False
        quantcols = tsvreader.get_columns_by_pattern(self.oldheader,
                                                     self.quantcolpattern)
        totalproteome, tpacc, tp_pepacc = False, False, False
        if self.totalprotfn:
            pep_tp_accs = [psmh.HEADER_MASTER_PROT, psmh.HEADER_SYMBOL,
                           psmh.HEADER_GENE, peph.HEADER_PROTEINS]
            totalphead = tsvreader.get_tsv_header(self.totalprotfn)
            # tpacc and tp_pepacc keep the matching pair after the break
            for tpacc, tp_pepacc in zip(proth.TPROT_HEADER_ACCS, pep_tp_accs):
                if totalphead[0] == tpacc and tp_pepacc in self.header:
                    break
            else:
                print('Could not find correct header field name in the total '
                      'proteome table passed. '
                      'Should be one of {}'.format(proth.TPROT_HEADER_ACCS))
                sys.exit(1)
            totalproteome = tsvreader.generate_split_tsv_lines(
                self.totalprotfn, totalphead)
        if self.mednorm_factors:
            factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
            mn_factors = tsvreader.generate_split_tsv_lines(
                self.mednorm_factors, factor_header)
        else:
            mn_factors = False
        psm_count_fields = [isosummarize.get_no_psms_field(field)
                            for field in quantcols]
        self.header = (self.header + quantcols + psm_count_fields
                       + [proth.HEADER_NO_FULLQ_PSMS])
        peptides = isosummarize.get_isobaric_ratios(
            self.fn, self.oldheader, quantcols, denomcols, self.mediansweep,
            self.medianintensity, self.median_or_avg, self.minint, peptides,
            self.header[0], psmh.HEADER_PEPTIDE, totalproteome, tpacc,
            tp_pepacc, self.logisoquant, self.mediannormalize, mn_factors,
            self.keepnapsms)

    if self.modelqvals:
        insert_at = self.header.index(peph.HEADER_QVAL) + 1
        self.header = (self.header[:insert_at] + [peph.HEADER_QVAL_MODELED]
                       + self.header[insert_at:])
        scorecol = tsvreader.get_cols_in_file(self.scorecolpattern,
                                              self.oldheader, True)
        peptides = psmtopeptable.recalculate_qvals_linear_model(
            peptides, scorecol, self.qvalthreshold, self.minpeptidenr)
    self.features = peptides