def loadFromFiles(self, event):
    """Reload every data, PSM and feature file listed in the file display.

    Rebuilds ``self.dataPtrs``, ``self.psms``, ``self.features``,
    ``self.ms1s`` and ``self.proteins`` from scratch, reporting each stage
    through ``self.set_status``.

    event -- when truthy, ``self.render(None)`` is called once loading is
             finished; a falsy value skips the redraw.
    """

    def listed_path(row, column):
        # A display cell holds a label; self.files maps it to the real path.
        return self.files[self.fileDisplay.GetItemText(row, column)]

    def display_rows():
        return range(0, self.fileDisplay.GetItemCount())

    # Column 1: raw data files, keyed below by their base name.
    self.set_status("Opening data files...", 0)
    data_paths = [listed_path(row, 1) for row in display_rows()]
    self.dataPtrs = {}
    for path in data_paths:
        base = os.path.basename(path)
        if self.curves:
            pointer = mzFileMapped(path, self.curves[base])
        else:
            pointer = mzFileMapped(path)
        self.dataPtrs[base] = pointer

    # Column 2: the PSM report that belongs to each data file.
    self.set_status("Loading PSMs...", 0)
    report_pairs = [(listed_path(row, 1), listed_path(row, 2))
                    for row in display_rows()]
    self.psms = {}
    for path, resultfile in report_pairs:
        rows = list(reader(resultfile))
        self.psms[os.path.basename(path)] = (resultfile, rows)

    # Column 3: the feature file that belongs to each data file.
    feature_pairs = [(listed_path(row, 1), listed_path(row, 3))
                     for row in display_rows()]
    self.features = {}
    for path, featurefile in feature_pairs:
        database = FeatureInterface(featurefile)
        self.features[os.path.basename(path)] = database

    self.set_status("Loading MS1 info...", 0)
    self.ms1s = dict(
        (base, [scan for scan in pointer.scan_info(0, 9999999)
                if scan[3] == 'MS1'])
        for base, pointer in self.dataPtrs.items())

    # Tag each PSM with its source data file, then pool PSMs per accession.
    self.set_status("Collecting peptides...", 0)
    self.proteins = defaultdict(list)
    for base, (resultfile, rows) in self.psms.items():
        for row in rows:
            row['Datafile'] = base
        per_accession = collectByCriterion(
            rows, lambda row: row['Accession Number'])
        for accession, members in per_accession.items():
            self.proteins[accession] += members

    def peptide_tag(row):
        return renderPeptideTag((row['Peptide Sequence'],
                                 row['Variable Modifications'],
                                 row['Charge']))

    # Replace each accession's flat PSM list with a grouping by peptide tag.
    # Only existing keys are reassigned, so iterating while assigning is safe.
    for accession, members in self.proteins.items():
        self.proteins[accession] = collectByCriterion(members, peptide_tag)

    self.set_status("...", 0)
    if event:
        self.render(None)
def openResultFile(self, event):
    """Read the PSM report named in the result control and index it.

    The report path is taken from ``self.resultCtrl``.  Rows are grouped
    first by protein and then, within each protein, by a peptide key made
    of sequence, variable modifications and charge joined with ``'|'``.
    The result is stored in ``self.psms`` as ``{protein: grouping}``, where
    each grouping is whatever ``collectByCriterion`` returns, and the PSM
    display is refreshed.

    A row whose protein field lists several ``;``-separated entries is
    filed under each of them.  An empty report leaves ``self.psms`` empty
    instead of failing.

    event -- unused; accepted so the method can serve as an event handler.
    """
    self.set_status("Opening PSM file...", 0)
    reportfile = self.resultCtrl.GetValue()
    results = list(reader(reportfile))

    # Prefer gene symbols when the report carries them.  The column set is
    # sniffed from the first row, so an empty report (no first row) simply
    # falls back to the accession column rather than raising IndexError.
    if results and 'gene_symbol' in results[0]:
        proteinLabel = 'gene_symbol'
    else:
        proteinLabel = 'Accession Number'

    proteins = defaultdict(list)
    for psm in results:
        for accession in psm[proteinLabel].split(';'):
            proteins[accession.strip()].append(psm)

    def _peptide_key(psm):
        return '|'.join([psm['Peptide Sequence'],
                         psm['Variable Modifications'],
                         str(psm['Charge'])])

    self.psms = {}
    for protein, psms in proteins.items():
        self.psms[protein] = collectByCriterion(psms, _peptide_key)

    self.updatePSMDisplay(None)
def combine_peptides(reportfile, isobaric=None, outputfile=None):
    """Collapse a PSM report to one row per peptide.

    Rows are grouped by peptide sequence plus the *set* of variable
    modifications (charge is deliberately ignored).  For each group the row
    with the highest 'Peptide Score' is written, extended with:

      PSMs        -- number of rows in the group
      Max<label>  -- reporter values of the PSM whose reporters sum highest
      Sum<label>  -- per-channel totals over every PSM in the group

    The Max/Sum columns are only filled when ``isobaric`` is given.
    Reporter values are read from each row's 'Spectrum Description' via
    ``standard_title_parse``, looked up by the lower-cased label.

    reportfile -- path of the PSM report to read.
    isobaric   -- None, 4, 6, 8 or 10: the plex of the isobaric label set.
    outputfile -- destination path; defaults to ``reportfile`` tagged with
                  'peptide_combined' by ``insert_tag``.

    Returns the path of the written file.
    Raises AssertionError for an unsupported ``isobaric`` value.
    """
    from multiplierz.mzReport import reader, writer
    from multiplierz.mgf import standard_title_parse

    isobaric_labels = {
        None: [],
        4: ['114', '115', '116', '117'],
        6: ['126', '127', '128', '129', '130', '131'],
        8: ['113', '114', '115', '116', '117', '118', '119', '121'],
        10: ['126', '127N', '127C', '128N', '128C',
             '129N', '129C', '130N', '130C', '131'],
    }
    assert isobaric in isobaric_labels, \
        "Unsupported isobaric plex: %r" % (isobaric,)
    labels = isobaric_labels[isobaric]

    def _byPeptide(row):
        # Not counting charge.
        varmodset = frozenset(
            [x.strip() for x in row['Variable Modifications'].split(';')])
        return row['Peptide Sequence'], varmodset

    def _getReporters(row):
        attrib = standard_title_parse(row['Spectrum Description'])
        return [float(attrib[x.lower()]) for x in labels]

    # Read everything up front and release the input file straight away;
    # the original left the reader open for the life of the process.
    psms = reader(reportfile)
    try:
        columns = psms.columns
        rowsByPeptide = collectByCriterion(psms, _byPeptide)
    finally:
        psms.close()

    sum_cols = ['Sum%s' % x for x in labels]
    top_cols = ['Max%s' % x for x in labels]

    if not outputfile:
        outputfile = insert_tag(reportfile, 'peptide_combined')

    output = writer(outputfile,
                    columns=(columns + sum_cols + top_cols + ['PSMs']))
    try:
        for group in rowsByPeptide.values():
            # The best-scoring row represents the peptide; it is extended
            # in place with the summary columns.
            outrow = max(group, key=lambda x: x['Peptide Score'])
            outrow['PSMs'] = len(group)

            if isobaric:
                repsets = [_getReporters(x) for x in group]
                toprepset = max(repsets, key=sum)
                sumrepset = [sum(x) for x in zip(*repsets)]
                for rep, col in zip(toprepset, top_cols):
                    outrow[col] = rep
                for rep, col in zip(sumrepset, sum_cols):
                    outrow[col] = rep

            output.write(outrow)
    finally:
        # Close the writer even if a row fails, so the handle is not leaked.
        output.close()

    return outputfile
def site_form_mod_names(self):
    """Return the sorted display names of all modifications.

    Rows of the ``modsites`` table are grouped by (name, specset).  Each
    group yields one string of the form ``"<name> (<sites>)"``, where
    ``<sites>`` is the group's distinct site values, sorted and joined
    with no separator.
    """
    from multiplierz.internalAlgorithms import collectByCriterion

    command = """SELECT name, site, specset FROM modsites"""
    self.cur.execute(command)
    specmoddata = self.cur.fetchall()

    specmodsets = collectByCriterion(specmoddata, lambda x: (x[0], x[2]))

    modnames = []
    for specmodset in specmodsets.values():
        modname = specmodset[0][0]
        # Pull the site column row by row.  The previous zip(*rows)[1] only
        # works where zip() returns a list; on Python 3 it returns an
        # iterator and subscripting it raises TypeError.
        sites = ''.join(sorted(set(row[1] for row in specmodset)))
        modnames.append('%s (%s)' % (modname, sites))

    return sorted(modnames)
def psm_intersection(directory, mode_subdirs):
    """
    To give a more accurate depiction of the relative elution profile of each
    label state, the final results only consider peptides that appear in the
    results for every state listed in mode_subdirs.

    This determines the overlapping peptide repertoire detected across all of
    those experiments, and produces subset result files that only include
    these peptides.

    directory    -- base directory containing one sub-directory per state.
    mode_subdirs -- iterable of (mode, subdir, par) triples; only 'xlsx'
                    files whose path contains 'FDR' are considered.

    For each input sub-directory a sibling '<subdir>_intersection_sheets'
    directory is created (if needed) and filled with filtered copies of the
    FDR files.  Within one output file each peptide key is written once, at
    its first occurrence.

    Returns a list of (mode, new_subdir) pairs, one per input triple; an
    empty mode_subdirs yields an empty list.
    """
    psmByCondition = {}
    for mode, subdir, par in mode_subdirs:
        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        conditionPSMs = []
        for resultfile in files:
            if 'FDR' not in resultfile:
                continue
            conditionPSMs += list(reader(resultfile))
        psmByCondition[subdir] = collectByCriterion(conditionPSMs, peptideKey)

    # Peptide keys present under every condition.  Built without reduce()
    # or dict.values()[0], neither of which is available on Python 3, and
    # guarded so that an empty condition list is not an error.
    keysets = [set(grouped.keys()) for grouped in psmByCondition.values()]
    consistentPSMs = set.intersection(*keysets) if keysets else set()

    newSubdirs = []
    for mode, subdir, par in mode_subdirs:
        newSubdir = subdir + '_intersection_sheets'
        newSubdirs.append((mode, newSubdir))

        target = os.path.join(directory, newSubdir)
        try:
            os.mkdir(target)
        except OSError:
            # A directory left over from an earlier run is expected; any
            # other failure (permissions, missing parent, ...) would only
            # resurface later as a less obvious write error, so report it.
            if not os.path.isdir(target):
                raise

        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        for filename in files:
            if 'FDR' not in filename:
                continue

            alreadySeenPeptides = set()
            psms = reader(filename)
            filterfile = writer(os.path.join(target,
                                             os.path.basename(filename)),
                                columns=psms.columns)
            try:
                for psm in psms:
                    pepKey = peptideKey(psm)
                    if (pepKey in consistentPSMs
                            and pepKey not in alreadySeenPeptides):
                        alreadySeenPeptides.add(pepKey)
                        filterfile.write(psm)
            finally:
                # Always release the output file, even on a bad row.
                filterfile.close()

    return newSubdirs
def get_pycomet_lookup(self):
    """Build a ``{mod name: {site type: mass delta}}`` lookup.

    Rows of the ``modsites`` table are grouped by (name, specset).  For
    each group the modification's delta is fetched with
    ``self.get_mod_delta`` and assigned to every distinct site of the
    group, translated through ``siteTypeLookup``.
    """
    from multiplierz.internalAlgorithms import collectByCriterion

    command = """SELECT name, site, specset FROM modsites"""
    self.cur.execute(command)
    rows = self.cur.fetchall()

    grouped = collectByCriterion(rows, lambda row: (row[0], row[2]))

    lookup = {}
    for group in list(grouped.values()):
        modname = group[0][0]
        delta = self.get_mod_delta(modname)
        distinct_sites = sorted(set(select(1, group)))
        # NOTE(review): groups are keyed by (name, specset) but stored by
        # name alone, so a later specset of the same modification replaces
        # the earlier entry -- confirm that this is intended.
        lookup[modname] = dict((siteTypeLookup[site], delta)
                               for site in distinct_sites)
    return lookup
def cross_report_key(self):
    """Write a presence/absence table of criterion keys across input files.

    Every row of every reader in ``self.inputfiles`` is reduced to a key:
    the tuple of its values for the columns named in ``self.criteria``.
    For each distinct key one row is written to ``self.output`` holding
    'Key' (the key's parts joined with '__') and, per input file name, a
    boolean telling whether that file contains the key.
    """
    def criterion(psm):
        return tuple([psm[x] for x in self.criteria])

    criteriaByFile = {}
    for filename, rdr in self.inputfiles:
        criteriaByFile[filename] = collectByCriterion(rdr, criterion)

    # Union of all keys.  Built incrementally: summing the key collections
    # into a list is quadratic, and on Python 3 fails outright because key
    # views cannot be added to a list.
    entities = set()
    for byCriterion in criteriaByFile.values():
        entities.update(byCriterion.keys())

    for thing in entities:
        outrow = {'Key': '__'.join(map(str, thing))}
        for filename, byCriterion in criteriaByFile.items():
            outrow[filename] = thing in byCriterion
        self.output.write(outrow)

    # Call form prints the same text on Python 2 and parses on Python 3.
    print("Report key generated.")
def plotPeptideXIC(self):
    """Draw the XIC plot for the currently selected peptide.

    With no peptide selected the plot is cleared and nothing else happens.
    Otherwise the retention-time span to plot is the envelope of (a) the
    mapped retention times of the peptide's identifying scans and (b) the
    aligned retention-time ranges of the features those PSMs refer to.  The
    m/z is the average 'Experimental mz' of the PSMs.  If a scan is
    selected (``self.plot_scan``, formatted '<scan> --- <datafile>'), its
    mapped retention time is passed along as well.
    """
    assert self.proteins

    if not self.plot_peptide:
        self.xicPlot.clear()
        return

    psms = self.proteins[self.plot_accession][self.plot_peptide]
    mz = average([float(psm['Experimental mz']) for psm in psms])
    psmsByFile = collectByCriterion(psms, lambda psm: psm['Datafile'])

    # Retention times of the scans that identified the peptide.  The scan
    # number is the second '.'-separated field of the spectrum description.
    scan_times = []
    for datafile, members in psmsByFile.items():
        scan_numbers = [int(psm['Spectrum Description'].split('.')[1])
                        for psm in members]
        for number in scan_numbers:
            scan_times.append(self.dataPtrs[datafile].map_timeForScan(number))

    # Retention-time ranges of the associated features; '-' marks a PSM
    # without a feature.
    feature_times = []
    rangesByFile = {}
    for datafile, members in psmsByFile.items():
        indices = [int(psm['Feature']) for psm in members
                   if psm['Feature'] != '-']
        features = [self.features[datafile][index] for index in indices]
        scanranges = [feature.scanrange for feature in features]
        raw_ranges = [[self.dataPtrs[datafile].timeForScan(scan)
                       for scan in scanrange]
                      for scanrange in scanranges]
        aligned_ranges = [[applyAlignmentToPoint(self.curves[datafile],
                                                 point, inverse = True)
                           for point in rtrange]
                          for rtrange in raw_ranges]
        rangesByFile[datafile] = aligned_ranges
        for rtrange in aligned_ranges:
            feature_times.extend(rtrange)

    all_times = scan_times + feature_times
    span = (min(all_times), max(all_times))

    scanrt = None
    if self.plot_scan:
        scan, scandatafile = [part.strip()
                              for part in self.plot_scan.split('---')]
        scanrt = self.dataPtrs[scandatafile].map_timeForScan(int(scan))

    self.xicPlot.plotXICs(self.dataPtrs, span, mz, scanrt, rangesByFile)
def entries_in_common(self):
    """Write every row whose criterion key occurs in all input files.

    A row's key is the tuple of its values for the columns named in
    ``self.criteria``.  For each key present in every file listed in
    ``self.inputfiles``, all matching rows from all files are written to
    ``self.output``, restricted to the columns in ``self.outcolumns`` and
    tagged with a 'Source' column naming the originating file.
    """
    def criterion(psm):
        return tuple([psm[x] for x in self.criteria])

    criteriaByFile = {}
    for filename, rdr in self.inputfiles:
        criteriaByFile[filename] = collectByCriterion(rdr, criterion)

    for filename, byCriterion in criteriaByFile.items():
        for thing, psms in byCriterion.items():
            # Generator form stops at the first file lacking the key.
            if not all(thing in criteriaByFile[x]
                       for x, _ in self.inputfiles):
                continue
            for psm in psms:  # Currently writes all instances.
                outpsm = dict((k, v) for k, v in psm.items()
                              if k in self.outcolumns)
                outpsm['Source'] = filename
                self.output.write(outpsm)

    # Call form prints the same text on Python 2 and parses on Python 3.
    print("Common entry report generated.")
def unique_by_file(self):
    """Write one row for each criterion key found in exactly one input file.

    A row's key is the tuple of its values for the columns named in
    ``self.criteria``.  For each key that no other file in
    ``self.inputfiles`` contains, the first matching row is written to
    ``self.output``, restricted to the columns in ``self.outcolumns`` and
    tagged with a 'Source' column naming the file it came from.
    """
    def criterion(psm):
        return tuple([psm[x] for x in self.criteria])

    criteriaByFile = {}
    for filename, rdr in self.inputfiles:
        criteriaByFile[filename] = collectByCriterion(rdr, criterion)

    for filename, byCriterion in criteriaByFile.items():
        for thing, psms in byCriterion.items():
            # Generator form stops at the first other file holding the key.
            if any(thing in criteriaByFile[x]
                   for x, _ in self.inputfiles if x != filename):
                continue
            # Currently writes only one instance of a unique entry.
            outpsm = dict((k, v) for k, v in psms[0].items()
                          if k in self.outcolumns)
            outpsm['Source'] = filename
            self.output.write(outpsm)

    # Call form prints the same text on Python 2 and parses on Python 3.
    print("Unique entry report generated.")
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but
    for the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred.

    For each fraction in which a given peptide appeared, we take XICs over
    the m/z values for a set of possible charges and compare their total
    intensity; the fraction with the most intense XIC(s) is assigned that
    peptide for the final count.

    directory -- directory holding the raw files (names ending in 'raw')
                 and the result sub-directories.
    subdirs   -- sub-directories of `directory` whose 'xlsx' result files
                 are processed; files already tagged 'XIC_localized' are
                 skipped.

    For every result file that wins at least one peptide, a sibling
    '<name>.XIC_localized.xlsx' is written containing the first PSM of each
    peptide assigned to it.  Returns None.
    """
    tolerance = 0.1        # m/z half-width of each XIC window
    time_tolerance = 15    # padding added on both sides of the RT window

    rawfiles = {}
    try:
        # Raw files are keyed by the part of their name before the first '.'.
        for name in os.listdir(directory):
            if name.lower().endswith('raw'):
                rawfiles[name.split('.')[0]] = mzFile(
                    os.path.join(directory, name))

        columns = None
        for subdir in subdirs:
            resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
            resultfiles = [x for x in resultfiles
                           if 'XIC_localized' not in x]

            # peptide -> {result file -> PSMs of that peptide in that file}
            peptidesForFile = defaultdict(dict)
            for resultfile in resultfiles:
                rdr = reader(resultfile)
                columns = rdr.columns
                psmsByPeptide = collectByCriterion(
                    list(rdr),
                    lambda x: (x['Peptide Sequence'],
                               x['Variable Modifications']))
                for peptide, psms in psmsByPeptide.items():
                    peptidesForFile[peptide][resultfile] = psms

            outputByFile = defaultdict(list)
            for peptide, psmsByFile in peptidesForFile.items():
                allPSMs = [psm for group in psmsByFile.values()
                           for psm in group]
                mass = allPSMs[0]['Predicted mr']
                assert len(set(x['Predicted mr'] for x in allPSMs)) == 1
                charges = set(x['Charge'] for x in allPSMs)

                # (raw file key, scan number) is taken from the first two
                # '.'-separated fields of the spectrum description.
                allScans = set(
                    tuple(x['Spectrum Description'].split('.')[:2])
                    for x in allPSMs)
                allRTs = set(
                    rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                    for x in allScans)
                minRT, maxRT = min(allRTs), max(allRTs)

                xicsByFile = []
                for resultfile, psms in psmsByFile.items():
                    rawfile = rawfiles[
                        os.path.basename(resultfile.split('.')[0])]
                    xicInt = 0
                    for charge in charges:
                        # 1.0072764 is presumably the proton mass.
                        mz = (mass + (1.0072764 * charge)) / charge
                        xic = rawfile.xic(minRT - time_tolerance,
                                          maxRT + time_tolerance,
                                          mz - tolerance,
                                          mz + tolerance)
                        # Sum the second element of every XIC point.
                        # zip(*xic)[1] cannot be used: it raises on Python 3
                        # and on an empty XIC.
                        xicInt += sum(point[1] for point in xic)
                    xicsByFile.append((xicInt, resultfile))

                highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
                outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

            for resultfile, psms in outputByFile.items():
                # Swap the trailing '.xlsx' for the tagged extension.
                outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
                output = writer(outputfile, columns=columns)
                try:
                    for psm in psms:
                        output.write(psm)
                finally:
                    output.close()
    finally:
        # Release every raw file handle opened above, even after an error.
        for opened in rawfiles.values():
            opened.close()