def mascot_headers(self, report_file, mascot_headers):
    '''Merge several Mascot header pages into one page of a report.

    mascot_headers is a sequence of (file name, header table) pairs.  The
    first line of every table is skipped; each remaining line contributes
    its first item as a row label and its second item as that file's value.
    Lines whose label is a single space, or whose value is text starting
    with '-----', are ignored.

    The merged page is written to report_file as the 'MascotHeader' table
    when the name ends in '.mzd', otherwise as the 'Mascot_Header' sheet.
    It has a 'Header' column followed by one column per input file.
    '''
    logger_message(30, 'Adding Mascot Headers...')

    values_by_file = defaultdict(dict)
    label_orders = set()
    source_files = [name for name, _ in mascot_headers]

    for name, header in mascot_headers:
        labels = []
        for entry in header[1:]:
            if entry[0] == ' ':
                continue
            if (isinstance(entry[1], (str, unicode))
                    and entry[1].startswith('-----')):
                continue
            values_by_file[name][entry[0]] = entry[1]
            labels.append(entry[0])
        label_orders.add(tuple(labels))

    if len(label_orders) > 1:
        logger_message(20, 'Headers seems different, will try to merge them')
        # Start from the longest label list, then append every label that
        # only occurs in the other headers, in sorted order.
        merged_labels = list(max(label_orders, key=len))
        leftovers = reduce(set.union, label_orders,
                           set()).difference(merged_labels)
        merged_labels += sorted(leftovers)
    else:
        merged_labels = list(label_orders.pop())

    file_titles = []
    for name in source_files:
        file_titles.append(os.path.basename(name) if name else ('-' * 50))
    cols = ['Header'] + file_titles

    if report_file.lower().endswith('.mzd'):
        destination = {'table_name': 'MascotHeader'}
    else:
        destination = {'sheet_name': 'Mascot_Header'}
    mascot_rep = mzReport.writer(report_file, columns=cols, **destination)

    for label in merged_labels:
        # A file that lacks this label gets None in its column.
        mascot_rep.write(
            [label] + [values_by_file[name].get(label)
                       for name in source_files])
    mascot_rep.close()
def prot_report(self, report_file, prot_report):
    '''Add a protein page to a report (XLS or MZD).

    For an '.mzd' report a 'ProteinData' view over the PeptideData table is
    created directly through sqlite3, and prot_report is not used.  For any
    other report the lines of prot_report are written to a 'Protein' sheet.
    '''
    logger_message(30, 'Adding Protein Info...')

    if not report_file.lower().endswith('.mzd'):
        sheet_columns = [
            'Protein Rank', 'Accession Number', 'Protein Description',
            'Protein Mass', 'Protein Matches', 'Protein Score',
            'Unique Peptides'
        ]
        sheet = mzReport.writer(report_file, columns=sheet_columns,
                                sheet_name='Protein')
        for entry in prot_report:
            sheet.write(entry)
        sheet.close()
        return

    view_sql = (
        'create view ProteinData as select '
        '"Protein Rank","Accession Number","Protein Description",'
        '"Protein Mass","Protein Matches","Protein Score",'
        'count(distinct "Peptide Sequence") as "Unique Peptides"'
        ' from PeptideData group by "Protein Rank","Accession Number"')
    connection = mzReport.mzDB.sqlite3.connect(report_file)
    connection.execute(view_sql)
    connection.close()
def calculate(self, event):
    """Run the ion search described by the dialog controls.

    Reads the data file, ion list (whitespace separated numbers), tolerance
    and threshold from the controls, calls findIonsInData, and either saves
    the rows to the chosen output file or shows them in a FoundIonsDisplay.

    Raises:
        IOError: if saving is requested but no output file was selected.
        ValueError: if an ion, the tolerance or the threshold is not a
            valid number (raised by float()).
    """
    datafile = self.ionFileCtrl.GetValue()
    ions = [float(x) for x in self.ionSelectCtrl.GetValue().split()]
    tolerance = float(self.toleranceCtrl.GetValue())
    threshold = float(self.thresholdCtrl.GetValue())

    saveOutput = self.saveAsCheck.GetValue()
    outputFile = None
    if saveOutput:
        outputFile = self.saveAsCtrl.GetValue()
        if not outputFile:
            # Call syntax works on both Python 2 and Python 3.
            raise IOError("Output file not selected.")

    results, resultColumns = findIonsInData(datafile, ions, tolerance,
                                            threshold, includeColumns=True)

    if saveOutput:
        output = writer(outputFile, columns=resultColumns)
        try:
            for row in results:
                output.write(row)
        finally:
            # Release the output file even if a row fails to write.
            output.close()
    else:
        resultDisplay = FoundIonsDisplay(self, -1, resultColumns, results)
        resultDisplay.Show()
def convert_report(reportfile, outputfile=None):
    """Convert an .mzid file into a Mascot-like report.

    Only the most important fields are kept; protein-level statistics,
    missed cleavages and Mascot-specific fields (e.g. query) are not
    reported.  When outputfile is not given, the report is written next to
    the input as reportfile + '.csv'.
    """
    from multiplierz.mzTools.mzIdentMLAPI import mzIdentML
    from multiplierz.mass_biochem import remove_protons
    from multiplierz.mzReport import reader, writer

    output_columns = [
        'File', 'Rank', 'Accession Number', 'Protein Description',
        'Peptide Sequence', 'Variable Modifications', 'Experimental mz',
        'Predicted mr', 'Charge', 'Delta', 'Peptide Score',
        'Expectation Value', 'Start Position', 'End Position',
        'Preceding Residue', 'Following Residue', 'Spectrum Description'
    ]
    # Fields of the peptide summary that have no column in the output.
    dropped_fields = ('Calculated mz', 'Passed Threshold', 'Peptide ID',
                      'Spectrum ID')

    if not outputfile:
        outputfile = reportfile + '.csv'

    source = mzIdentML(reportfile)
    out = writer(outputfile, columns=output_columns)
    for entry in source.peptideSummary():
        charge = int(entry['Charge'])
        entry['Predicted mr'] = remove_protons(entry['Calculated mz'], charge)
        for field in dropped_fields:
            del entry[field]
        out.write(entry)
    out.close()
def detect_matches(file_names, fields=None, tol_field=None, tolerance=0.0,
                   save_file=''):
    """Collect the output of _detect_matches, optionally saving it.

    The first item produced by _detect_matches is taken as the column list;
    every following item is a result line.

    Args:
        file_names: passed through to _detect_matches.
        fields: passed through to _detect_matches; None (the default) is
            passed on as an empty list.
        tol_field: passed through to _detect_matches.
        tolerance: passed through to _detect_matches.
        save_file: if non-empty, the lines are also written to this report
            file through mzReport.writer.

    Returns:
        A list whose first element is the column list, followed by every
        result line.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if fields is None:
        fields = []

    detect_gen = _detect_matches(file_names, fields, tol_field, tolerance)
    # next() works for generators on both Python 2.6+ and Python 3.
    cols = next(detect_gen)
    match_out = [cols]

    report = mzReport.writer(save_file, columns=cols) if save_file else None
    try:
        for line in detect_gen:
            match_out.append(line)
            if report is not None:
                report.write(line)
    finally:
        # Release the generator and the output file even on failure.
        detect_gen.close()
        if report is not None:
            report.close()

    return match_out
def combine_peptides(reportfile, isobaric=None, outputfile=None):
    """Collapse a PSM report to one row per peptide.

    PSMs are grouped by peptide sequence and the set of variable
    modifications (charge is not considered).  For each group the row with
    the highest 'Peptide Score' is written, with a 'PSMs' column holding
    the group size.  When isobaric is set, reporter values are parsed from
    each 'Spectrum Description' and two sets of columns are filled: 'Max<label>'
    from the PSM with the largest reporter total, and 'Sum<label>' with the
    per-label sum over the group.

    Args:
        reportfile: path of the input PSM report.
        isobaric: None, 4, 6, 8 or 10; selects the reporter label set.
        outputfile: output path; defaults to reportfile tagged with
            'peptide_combined' via insert_tag.

    Returns:
        The path of the written report.
    """
    from multiplierz.mzReport import reader, writer
    from multiplierz.mgf import standard_title_parse

    isobaric_labels = {
        None: [],
        4: ['114', '115', '116', '117'],
        6: ['126', '127', '128', '129', '130', '131'],
        8: ['113', '114', '115', '116', '117', '118', '119', '121'],
        10: [
            '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N',
            '130C', '131'
        ]
    }
    assert isobaric in isobaric_labels
    labels = isobaric_labels[isobaric]

    def _byPeptide(row):
        # Not counting charge.  A missing modification value is treated
        # like an empty string instead of crashing on .split().
        mods = row['Variable Modifications'] or ''
        varmodset = frozenset([x.strip() for x in mods.split(';')])
        return row['Peptide Sequence'], varmodset

    def _getReporters(row):
        attrib = standard_title_parse(row['Spectrum Description'])
        return [float(attrib[x.lower()]) for x in labels]

    report = reader(reportfile)
    try:
        input_columns = report.columns
        rowsByPeptide = collectByCriterion(report, _byPeptide)
    finally:
        # All rows are grouped in memory now; release the input file.
        report.close()

    sum_cols = ['Sum%s' % x for x in labels]
    top_cols = ['Max%s' % x for x in labels]

    if not outputfile:
        outputfile = insert_tag(reportfile, 'peptide_combined')

    output = writer(outputfile,
                    columns=(input_columns + sum_cols + top_cols + ['PSMs']))
    try:
        for pep, pep_psms in rowsByPeptide.items():
            outrow = max(pep_psms, key=lambda x: x['Peptide Score'])
            outrow['PSMs'] = len(pep_psms)

            if isobaric:
                repsets = [_getReporters(x) for x in pep_psms]
                toprepset = max(repsets, key=sum)
                sumrepset = [sum(x) for x in zip(*repsets)]
                for rep, col in zip(toprepset, top_cols):
                    outrow[col] = rep
                for rep, col in zip(sumrepset, sum_cols):
                    outrow[col] = rep

            output.write(outrow)
    finally:
        output.close()

    return outputfile
def mascot_header(self, report_file, mascot_header):
    '''Add a Mascot header page to a report (XLS or MZD).

    The first line of mascot_header supplies the column names and the
    remaining lines are written as rows.  The page is the 'MascotHeader'
    table for '.mzd' reports and the 'Mascot_Header' sheet otherwise.
    '''
    logger_message(30, 'Adding Mascot Header...')

    title_line = mascot_header[0]
    if report_file.lower().endswith('.mzd'):
        destination = {'table_name': 'MascotHeader'}
    else:
        destination = {'sheet_name': 'Mascot_Header'}
    page = mzReport.writer(report_file, columns=title_line, **destination)

    for entry in mascot_header[1:]:
        page.write(entry)
    page.close()

    logger_message(20, 'Mascot Header Complete')
def concatenate_reports(reportfiles, outputfile):
    """Concatenate several reports into one file.

    Only the columns present in every input report are kept, in the order
    in which they appear in the first report.  A warning is printed when
    any report's column list differs from the kept columns.

    Args:
        reportfiles: paths of the reports to concatenate (at least one).
        outputfile: path of the combined report.

    Returns:
        outputfile.

    Raises:
        ValueError: if reportfiles is empty.
    """
    from multiplierz.mzReport import reader, writer

    # A real list is needed because readers[0] is indexed below; on
    # Python 3 a bare map() is lazy and cannot be indexed.
    readers = [reader(x) for x in reportfiles]
    if not readers:
        raise ValueError("No report files given to concatenate.")

    try:
        first_columns = readers[0].columns
        shared = set(first_columns).intersection(
            *[x.columns for x in readers])
        allcols = sorted(shared, key=first_columns.index)

        if not all(x.columns == allcols for x in readers):
            # Single parenthesised argument: prints identically on
            # Python 2 and Python 3.
            print("Warning- concatenation drops some columns!")

        output = writer(outputfile, columns=allcols)
        try:
            for report in readers:
                for row in report:
                    output.write(row, ignore_extra=True)
        finally:
            output.close()
    finally:
        for report in readers:
            report.close()

    return outputfile
def dispatchModes(self, event):
    """Run the report-combination mode selected in the dialog.

    Opens a reader for every listed input file, derives the output file
    name (default 'combined_output_file', '.xlsx' appended when the
    extension is not xls/xlsx/csv/mzd, placed next to the first input file
    when relative), prepares the output columns for the chosen mode, opens
    the writer and calls the matching worker method.

    The run button is disabled for the duration of the call and is always
    re-enabled, even when an error occurs.

    Raises:
        Exception: if the selected mode is not one of the known modes.
        AssertionError: if a checked criterion is not a column shared by
            all inputs (for the two non-concatenating column-based modes).
    """
    shared_column_modes = ('Concatenate All', 'Unique-by-File Report',
                           'Entries-in-Common Report')
    # Worker method name for each supported mode.
    workers = {
        'Concatenate All': 'concatenate',
        'Cross-Report Key': 'cross_report_key',
        'Unique-by-File Report': 'unique_by_file',
        'Entries-in-Common Report': 'entries_in_common',
    }

    self.runButton.Enable(False)
    try:
        mode = self.modeCtrl.GetString(self.modeCtrl.GetSelection())
        self.criteria = self.fieldsCtrl.GetCheckedStrings()
        self.inputfiles = [(x, reader(x)) for x in self.fileList.GetStrings()]

        outputfile = self.outputCtrl.GetValue()
        if not outputfile:
            outputfile = 'combined_output_file'
        if outputfile.split('.')[-1].lower() not in ('xls', 'xlsx', 'csv',
                                                     'mzd'):
            outputfile += '.xlsx'
        if not os.path.isabs(outputfile):
            outdir = os.path.dirname(self.inputfiles[0][0])
            outputfile = os.path.join(outdir, outputfile)

        if mode in shared_column_modes:
            columnsets = [x[1].columns for x in self.inputfiles]
            # Columns common to every input, kept in first-file order.
            columnIntersection = set(columnsets[0]).intersection(*columnsets)
            self.outcolumns = ['Source'] + [
                x for x in columnsets[0] if x in columnIntersection
            ]
            if mode != 'Concatenate All':
                assert all([x in self.outcolumns for x in self.criteria])
        elif mode == 'Cross-Report Key':
            self.outcolumns = ['Key'] + [x[0] for x in self.inputfiles]
        else:
            raise Exception('Unrecognized mode: %r' % (mode,))

        self.output = writer(outputfile, columns=self.outcolumns)
        try:
            getattr(self, workers[mode])()
        finally:
            # Do not leave the output file open if the worker fails.
            self.output.close()

        print("Wrote %s" % outputfile)
    finally:
        # Without this the button stayed disabled after any failure.
        self.runButton.Enable(True)
def combine_accessions(reportfile, outputfile = None):
    """
    Given a Mascot-style PSM report, this combines all protein hypotheses
    for a given MS2 spectrum into a single PSM.

    Rows are grouped by 'Spectrum Description'.  For each group the row
    with the highest 'Peptide Score' is kept; its 'Accession Number' and
    'Protein Description' become the '; '-joined values of the whole group
    (when those columns exist), a 'Protein Masses' column holds the joined
    'Protein Mass' values (when that column exists), and 'Protein
    Redundancy' holds the group size.

    outputfile may be safely specified to be the same as the input file,
    in order to overwrite the original file.  When omitted, the output
    name is reportfile tagged with 'combined_accessions' via insert_tag.

    Returns the output file name.
    """
    from multiplierz.mzReport import reader, writer

    report = reader(reportfile)
    columns = report.columns

    molecules = defaultdict(list)
    for row in report:
        molecules[row['Spectrum Description']].append(row)
    # Every row is in memory now; close the input before the output is
    # opened so that overwriting the input file is safe.
    report.close()

    outputData = []
    for rows in molecules.values():
        newRow = max(rows, key = lambda x: x['Peptide Score'])

        if 'Accession Number' in columns:
            newRow['Accession Number'] = '; '.join([x['Accession Number'] for x in rows])
        if 'Protein Description' in columns:
            newRow['Protein Description'] = '; '.join([x['Protein Description'] for x in rows])
        # The source column is 'Protein Mass'; the combined values go into
        # the new 'Protein Masses' column.
        if 'Protein Mass' in columns:
            newRow['Protein Masses'] = '; '.join([str(x['Protein Mass']) for x in rows])
        newRow['Protein Redundancy'] = len(rows)

        outputData.append(newRow)

    if outputData:
        # Keep only columns that the combined rows actually carry, adding
        # 'Protein Masses' once if it was produced.
        lastRow = outputData[-1]
        candidates = list(columns)
        if 'Protein Masses' not in candidates:
            candidates.append('Protein Masses')
        columns = [x for x in candidates if x in lastRow]
    # With no rows at all the column choice is arbitrary; keep the input's.

    if 'Protein Redundancy' not in columns:
        columns = list(columns) + ['Protein Redundancy']

    if not outputfile:
        outputfile = insert_tag(reportfile, 'combined_accessions')

    output = writer(outputfile, columns = columns)
    try:
        for row in outputData:
            output.write(row)
    finally:
        output.close()

    return outputfile
def psm_intersection(directory, mode_subdirs):
    """
    To give a more accurate depiction of the relative elution profile of
    each label state, the final results will only consider peptides that
    appear in the results for all states.  This determines the overlapping
    peptide repertoire detected across all experiments, and produces
    subset result files that only include these peptides.

    Args:
        directory: base directory containing the result subdirectories.
        mode_subdirs: iterable of (mode, subdir, par) triples; only the
            '.xlsx' files in each subdir whose name contains 'FDR' are used.

    For every subdir a sibling '<subdir>_intersection_sheets' directory is
    created, holding one filtered copy of each FDR file in which each
    shared peptide (as identified by peptideKey) appears at most once.

    Returns:
        A list of (mode, new subdirectory name) pairs.
    """
    def _read_all(filename):
        # Load every PSM of a report and release the file afterwards.
        rdr = reader(filename)
        try:
            return list(rdr)
        finally:
            rdr.close()

    psmByCondition = {}
    for mode, subdir, par in mode_subdirs:
        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        conditionPSMs = []
        for resultfile in files:
            if 'FDR' not in resultfile:
                continue
            conditionPSMs += _read_all(resultfile)
        psmByCondition[subdir] = collectByCriterion(conditionPSMs, peptideKey)

    keysets = [set(x.keys()) for x in psmByCondition.values()]
    # Peptides seen in every condition; empty when there are no conditions.
    consistentPSMs = set.intersection(*keysets) if keysets else set()

    newSubdirs = []
    for mode, subdir, par in mode_subdirs:
        newSubdir = subdir + '_intersection_sheets'
        newSubdirs.append((mode, newSubdir))
        newPath = os.path.join(directory, newSubdir)
        try:
            os.mkdir(newPath)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(newPath):
                raise

        files = typeInDir(os.path.join(directory, subdir), 'xlsx')
        for filename in files:
            if 'FDR' not in filename:
                continue
            alreadySeenPeptides = set()

            psms = reader(filename)
            try:
                filterfile = writer(
                    os.path.join(newPath, os.path.basename(filename)),
                    columns=psms.columns)
                try:
                    for psm in psms:
                        pepKey = peptideKey(psm)
                        if (pepKey in consistentPSMs
                                and pepKey not in alreadySeenPeptides):
                            alreadySeenPeptides.add(pepKey)
                            filterfile.write(psm)
                finally:
                    filterfile.close()
            finally:
                psms.close()

    return newSubdirs
def concatenate_reports(reportfiles, outputfile, include_file_column=False):
    """Concatenate several reports into one file.

    Only the columns present in every input report are kept, in the order
    in which they appear in the first report.  A warning is printed when
    any report's column list differs from the kept columns.

    Args:
        reportfiles: paths of the reports to concatenate (at least one).
        outputfile: path of the combined report.
        include_file_column: when true, every row is tagged with the base
            name of the report it came from in a 'FILE' column.

    Returns:
        outputfile.

    Raises:
        ValueError: if reportfiles is empty.
    """
    from multiplierz.mzReport import reader, writer

    readers = [reader(x) for x in reportfiles]
    if not readers:
        raise ValueError("No report files given to concatenate.")

    try:
        first_columns = readers[0].columns
        shared = set(first_columns).intersection(
            *[x.columns for x in readers])
        allcols = sorted(shared, key=first_columns.index)

        if not all(x.columns == allcols for x in readers):
            print("Warning- concatenation drops some columns!")

        outcols = list(allcols)
        # The writer is called with ignore_extra=True, so 'FILE' must be a
        # declared column or the value set below is silently discarded.
        # NOTE(review): the check is case-insensitive on the assumption
        # that column names may be matched without regard to case - confirm.
        if include_file_column and not any(
                str(c).upper() == 'FILE' for c in outcols):
            outcols.insert(0, 'FILE')

        output = writer(outputfile, columns=outcols)
        try:
            for filename, report in zip(reportfiles, readers):
                filename = os.path.basename(filename)
                for row in report:
                    if include_file_column:
                        row['FILE'] = filename
                    output.write(row, ignore_extra=True)
        finally:
            output.close()
    finally:
        for report in readers:
            report.close()

    return outputfile
def combineFiles(files, outputFile, ext):
    """Merge several reports into one, tagging each row with its source.

    The output uses the columns of the first file, preceded by a 'Source'
    column holding the base name of the file each row came from.

    Args:
        files: paths of the reports to merge (at least one).
        outputFile: output path; a relative path is reduced to its base
            name and placed in the directory of the first input file.
        ext: extension appended to outputFile when it does not already
            end with it.
    """
    if not os.path.isabs(outputFile):
        outputFile = os.path.join(os.path.dirname(files[0]),
                                  os.path.basename(outputFile))
    if not outputFile.endswith(ext):
        outputFile += ext

    # Wrapping in a 1-tuple keeps the formatting correct when `files` is
    # itself a tuple.
    print("Merging %s" % (files,))

    first = reader(files[0])
    try:
        columns = first.columns
    finally:
        first.close()

    output = writer(outputFile, columns = ['Source'] + columns)
    try:
        for filename in files:
            source = os.path.basename(filename)
            report = reader(filename)
            try:
                for row in report:
                    row['Source'] = source
                    output.write(row)
            finally:
                report.close()
    finally:
        output.close()

    print("Wrote %s !" % outputFile)
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due to various factors, but
    for the purpose of this analysis it is useful to consider a peptide as
    "belonging" only to the fraction in which the main bulk of the elution
    occurred.  For each fraction in which a given peptide appeared, we take
    XICs over the m/z values for a set of possible charges and compare
    their total intensity; the fraction with the most intense XIC(s) is
    assigned that peptide for the final count.

    Args:
        directory: directory holding the raw files (names ending in 'raw')
            and the result subdirectories.
        subdirs: subdirectories of `directory` whose '.xlsx' result files
            (except those already containing 'XIC_localized') are processed.

    For each result file that wins at least one peptide, a sibling file
    ending in '.XIC_localized.xlsx' is written with one PSM per peptide.
    """
    tolerance = 0.1       # half-width of the m/z window passed to xic()
    time_tolerance = 15   # padding added on both sides of the RT window

    # Raw files keyed by file name up to the first '.'.
    rawfiles = dict([(x.split('.')[0], mzFile(os.path.join(directory, x)))
                     for x in os.listdir(directory)
                     if x.lower().endswith('raw')])

    columns = None
    for subdir in subdirs:
        resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
        resultfiles = [x for x in resultfiles if 'XIC_localized' not in x]

        peptidesForFile = defaultdict(dict)
        for resultfile in resultfiles:
            rdr = reader(resultfile)
            try:
                columns = rdr.columns
                rows = list(rdr)
            finally:
                rdr.close()
            psmsByPeptide = collectByCriterion(
                rows,
                lambda x: (x['Peptide Sequence'], x['Variable Modifications']))
            for peptide, psms in psmsByPeptide.items():
                peptidesForFile[peptide][resultfile] = psms

        outputByFile = defaultdict(list)
        for peptide, psmsByFile in peptidesForFile.items():
            xicsByFile = []

            allPSMs = sum(psmsByFile.values(), [])
            mass = allPSMs[0]['Predicted mr']
            assert len(set(x['Predicted mr'] for x in allPSMs)) == 1
            charges = set(x['Charge'] for x in allPSMs)

            # The first two '.'-separated fields of the spectrum
            # description are used as (raw file key, scan number).
            allScans = set([
                tuple(x['Spectrum Description'].split('.')[:2])
                for x in allPSMs
            ])
            allRTs = set(rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                         for x in allScans)
            minRT, maxRT = min(allRTs), max(allRTs)

            for resultfile, psms in psmsByFile.items():
                # Take the key from the base name so that a '.' in a
                # directory name cannot truncate the path.
                rawkey = os.path.basename(resultfile).split('.')[0]
                rawfile = rawfiles[rawkey]

                xicInt = 0
                for charge in charges:
                    mz = (mass + (1.0072764 * charge)) / charge
                    xic = rawfile.xic(minRT - time_tolerance,
                                      maxRT + time_tolerance,
                                      mz - tolerance, mz + tolerance)
                    # Sum the second item of every XIC point; an empty XIC
                    # contributes zero.
                    xicInt += sum(point[1] for point in xic)

                xicsByFile.append((xicInt, resultfile))

            highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
            outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

        for resultfile, psms in outputByFile.items():
            # Strips the last five characters (the '.xlsx' suffix).
            outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
            output = writer(outputfile, columns=columns)
            try:
                for psm in psms:
                    output.write(psm)
            finally:
                output.close()
def evaluateTMTiTRAQ(outputfile, columns, results, resultIntMap):
    """Evaluate isobaric ('plex') labelling of a set of PSMs.

    PSMs are grouped by peptide sequence and non-label modifications.  Each
    PSM is written to a 'Data' sheet with added 'Lysines', 'Lysine Labels',
    'N-term Label', 'Fully Labelled' and 'Intensity' columns.  A PSM counts
    as fully labelled when every lysine carries a 'plex' modification and
    an N-terminal 'plex' modification is present.  A second 'Label Report'
    sheet lists, per peptide group, the fraction of intensity coming from
    fully labelled PSMs.

    Args:
        outputfile: path of the report to write.
        columns: columns of the input PSMs.
        results: iterable of PSM rows.
        resultIntMap: mapping from scan number (the integer second
            '.'-separated field of 'Spectrum Description') to intensity;
            missing scans count as intensity 0.

    Returns:
        A dict of counters: 'total peptides', 'total lysines',
        'fully labelled', 'nterm labelled' and 'lysine labelled'.
    """
    def _split_mods(psm):
        # Individual modification strings, stripped of surrounding
        # whitespace; a missing or empty value yields an empty list.
        raw = psm['Variable Modifications']
        if not raw:
            return []
        return [x.strip() for x in raw.split(';') if x.strip()]

    peptides = defaultdict(list)
    for psm in results:
        varmods = [x for x in _split_mods(psm) if 'plex' not in x]
        peptides[psm['Peptide Sequence'], '; '.join(varmods)].append(psm)

    output = writer(outputfile, sheet_name = 'Data',
                    columns = columns + ['Lysines', 'Lysine Labels',
                                         'N-term Label', 'Fully Labelled',
                                         'Intensity'])

    evaluation = {'total peptides':0,
                  'total lysines':0,
                  'fully labelled':0,
                  'nterm labelled':0,
                  'lysine labelled':0}
    labelSummary = []
    try:
        for key, pepSet in list(peptides.items()):
            fullLabelledInt = 0
            partLabelledInt = 0

            seq = key[0]
            lysCount = seq.count('K')
            for psm in pepSet:
                assert seq == psm['Peptide Sequence']
                mods = _split_mods(psm)
                lysLabelCount = len([x for x in mods
                                     if x[0] == 'K' and 'plex' in x])
                ntermLabel = any([(x.startswith('N-term') and 'plex' in x)
                                  for x in mods])

                psm['Lysines'] = lysCount
                # Number of labelled lysines, not the lysine total.
                psm['Lysine Labels'] = lysLabelCount
                psm['N-term Label'] = ntermLabel

                try:
                    intensity = resultIntMap[
                        int(psm['Spectrum Description'].split('.')[1])]
                except KeyError:
                    intensity = 0

                if (lysLabelCount == lysCount) and ntermLabel:
                    fullLabelledInt += intensity
                    psm['Fully Labelled'] = True
                else:
                    partLabelledInt += intensity
                    psm['Fully Labelled'] = False

                psm['Intensity'] = intensity
                output.write(psm)

            # NOTE(review): counters are updated once per peptide group,
            # using the values of the group's last PSM for the label
            # counts - confirm this is the intended statistic.
            evaluation['total peptides'] += 1
            evaluation['total lysines'] += lysCount
            evaluation['fully labelled'] += 1 if not partLabelledInt else 0
            evaluation['nterm labelled'] += int(ntermLabel)
            evaluation['lysine labelled'] += lysLabelCount

            labelSummary.append((key, fullLabelledInt, partLabelledInt))
    finally:
        output.close()

    summaryOutput = writer(outputfile, sheet_name = 'Label Report',
                           columns = ['Peptide Sequence', 'Non-Label Mods',
                                      '% Labelling'])
    try:
        for (seq, varmods), fullLabelledInt, partLabelledInt in labelSummary:
            row = {}
            row['Peptide Sequence'] = seq
            row['Non-Label Mods'] = varmods
            total = partLabelledInt + fullLabelledInt
            if total:
                # float() keeps this a true ratio under Python 2 as well.
                row['% Labelling'] = float(fullLabelledInt) / total
            else:
                row['% Labelling'] = ''
            summaryOutput.write(row)
    finally:
        summaryOutput.close()

    return evaluation
def filterJoin(filenames, matchColumns, returnMode, outputKeyFile,
               combinedOutputFile=None, outputFileType='.xlsx',
               outputTag=None, tolerance=None, toleranceColumn=None):
    """
    Produces a joined file, filtering out either repeat or unique PSMs.

    If returnMode is 'matched', output contains one instance of each PSM
    group; if returnMode is 'unmatched', the output contains every PSM that
    wasn't part of a larger PSM group.  (Where 'PSM group' is a set of PSMs
    that are identical based on the given matchColumns + toleranceColumn.)
    If 'both', both kinds of output are produced.

    Args:
        filenames: input report paths; all must have identical columns.
        matchColumns: columns whose values form the group signature.
        returnMode: 'matched', 'unmatched' or 'both'.
        outputKeyFile: if set, a key report is written there with one row
            per group and a per-file count of member PSMs.
        combinedOutputFile: if set, all selected PSMs are written there.
        outputFileType: extension for per-file outputs (leading '.' optional).
        outputTag: if set, one output per input file is written, named
            '<input name without extension>.<outputTag>.<extension>'.
        tolerance: maximum absolute difference in toleranceColumn for two
            PSMs to share a group; required when toleranceColumn is set.
        toleranceColumn: optional numeric column that further splits groups.

    Returns:
        A list of the output file names written (per-file outputs first,
        then the combined output).
    """
    assert returnMode in ['matched', 'unmatched', 'both']

    data = []
    columnLists = []
    for filename in filenames:
        subdata = []
        inputfile = reader(filename)
        columnLists.append(inputfile.columns)
        for psm in inputfile:
            psm['Source'] = filename
            subdata.append(psm)
        data.append(subdata)
        inputfile.close()

    assert all([columnLists[0] == x for x in columnLists]), "Heterogeneous data columns!"

    datadict = defaultdict(list)
    for subdata in data:
        for psm in subdata:
            signature = tuple([psm[x] for x in matchColumns])
            datadict[signature].append(psm)

    if toleranceColumn:
        toldatadict = {}
        for signature, sigGroup in datadict.items():
            subGroups = []
            for psm in sigGroup:
                match = False
                for subGroup in subGroups:
                    # A PSM joins the first sub-group in which it is within
                    # tolerance of every existing member.
                    if all([
                            abs(psm[toleranceColumn] -
                                subpsm[toleranceColumn]) < tolerance
                            for subpsm in subGroup
                    ]):
                        match = True
                        subGroup.append(psm)
                        break
                if not match:
                    subGroups.append([psm])

            for index, subGroup in enumerate(subGroups):
                # The sub-group index is appended as one more signature
                # element to keep the keys distinct.
                subSig = tuple(list(signature) + [index])
                toldatadict[subSig] = subGroup
        datadict = toldatadict

    if outputKeyFile:
        keyfile = writer(outputKeyFile, columns=['PSM Key'] + filenames)
        for signature, psmGroup in datadict.items():
            line = {}
            line['PSM Key'] = '|'.join([str(x) for x in signature])
            line.update([(x, len([y for y in psmGroup if y['Source'] == x]))
                         for x in filenames])
            keyfile.write(line)
        keyfile.close()

    outputpsms = []
    if returnMode == 'matched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) > 1:
                exemplar = psmGroup[0]
                sourceFiles = '; '.join(set([x['Source'] for x in psmGroup]))
                # NOTE(review): lower-case 'source' is kept from the
                # original; whether it addresses the same field as 'Source'
                # depends on the row type - confirm.
                exemplar['source'] = sourceFiles
                outputpsms.append(exemplar)

    if returnMode == 'unmatched' or returnMode == 'both':
        for psmGroup in datadict.values():
            if len(psmGroup) == 1:
                outputpsms.append(psmGroup[0])

    outputs = []
    if outputTag:
        # Drop any leading '.' from the extension; the join below supplies
        # the separator, so keeping it would produce 'name.tag..xlsx'.
        extension = outputFileType.lstrip('.')
        outputfiles = [
            (x, '.'.join(x.split('.')[:-1] + [outputTag, extension]))
            for x in filenames
        ]
        for filename, outputfile in outputfiles:
            output = writer(outputfile, columns=['Source'] + columnLists[0])
            for psm in [x for x in outputpsms if x['Source'] == filename]:
                output.write(psm)
            output.close()
        outputs = [x[1] for x in outputfiles]

    if combinedOutputFile:
        output = writer(combinedOutputFile,
                        columns=['Source'] + columnLists[0])
        for psm in outputpsms:
            output.write(psm)
        output.close()
        outputs.append(combinedOutputFile)

    return outputs
def filter_join(file_names, key_source_file, exclude=False, append=False,
                save_file_suffix='_filtered', ext='.xls'):
    """Write the filtered reports produced by _filter_join.

    The first item produced by _filter_join is taken as the collection of
    per-file column lists; each following item is a
    (filename, columns, rows) triple.

    Args:
        file_names: passed through to _filter_join.
        key_source_file: passed through to _filter_join; the combined
            output is placed in its directory.
        exclude: passed through to _filter_join.
        append: when true, all filtered rows go to a single
            'Combined<suffix><ext>' report (replacing an existing one) with
            a 'File' column; otherwise each input gets its own output named
            by inserting save_file_suffix before its extension.
        save_file_suffix: suffix for output names; must be non-empty when
            append is false.
        ext: extension of the combined report.

    Raises:
        ValueError: if append is false and save_file_suffix is empty.
    """
    if (not append) and (not save_file_suffix):
        raise ValueError(
            'Save_file_suffix cannot be empty string when not combining output.'
        )

    filtered_files = _filter_join(file_names, key_source_file, exclude,
                                  save_file_suffix)
    # next() works for generators on both Python 2.6+ and Python 3.
    cols = next(filtered_files)

    if not append:
        for (filename, columns, filedata) in filtered_files:
            # Write each sheet to a separate file.
            if 'Filter Key' not in columns:
                # list() so that tuple column sequences are accepted too.
                columns = ['Filter Key'] + list(columns)
            rep = mzReport.writer(
                save_file_suffix.join(os.path.splitext(filename)),
                columns=columns)
            try:
                for row in filedata:
                    rep.write(row)
            finally:
                rep.close()
        return

    combo_out_file = os.path.join(os.path.dirname(key_source_file),
                                  'Combined%s%s' % (save_file_suffix, ext))
    if os.path.exists(combo_out_file):
        os.remove(combo_out_file)

    # If the union of all columns is no larger than the smallest column
    # list, every file has the same columns.
    union_cols = set().union(*[set(c) for c in cols])
    same_cols = len(union_cols) == min(len(c) for c in cols)

    newcols = [] if 'File' in union_cols else ['File']
    if same_cols:
        # They are all the same, so use the first one.
        newcols += list(cols[0])
    else:
        # Use the largest column list as the template and put the
        # remaining columns on the end, sorted as strings.
        newcols += list(max(cols, key=len))
        newcols += sorted(union_cols.difference(newcols))

    if 'Filter Key' not in newcols:
        newcols.insert(1, 'Filter Key')

    combo_report = mzReport.writer(combo_out_file, columns=newcols)
    try:
        if same_cols:
            for (filename, columns, filedata) in filtered_files:
                for row in filedata:
                    row['File'] = filename  # adds File or overwrites it
                    combo_report.write(row)
        else:
            # Rows may lack some columns: start from a row of None values.
            blank_row = dict((c.lower(), None) for c in newcols)
            for (filename, columns, filedata) in filtered_files:
                for row in filedata:
                    new_row = blank_row.copy()
                    new_row.update(row)
                    new_row['file'] = filename  # adds File or overwrites it
                    combo_report.write(new_row)
    finally:
        combo_report.close()
def calculate_FDR(reportfile, outputfile = None, threshold = 0.01,
                  decoyString = 'rev_', includeStatisticsSheet = True,
                  includeDuplicates = True, separateDuplicateSheet = True,
                  includeFailedSheet = True, includeReverseSheet = True,
                  single_cutoff = True):
    """
    Performs Forward/Reverse database filtering on the target file, giving
    back the true PSMs over the specified statistical threshold as well as
    removed decoy and below-threshold PSMs, in respective sheets.

    All entries in the decoy (reverse) database must have accessions that
    contain some uniform marker; by default, "rev_" (so that
    gi|198292342|X7823_EXTRA becomes rev_gi|198292342|X7823_EXTRA.)  The
    marker is matched case-insensitively, and a PSM only counts as a decoy
    hit when every one of its ';'-separated accessions contains it.

    Rows are processed in order of decreasing 'Peptide Score'; the FDR of a
    row is (reverse hits so far) / (forward hits so far).  Later rows for
    an already seen 'Spectrum Description' are duplicates and inherit the
    FDR of the first row.  With single_cutoff, failed rows scoring above
    the lowest passing score are moved to the passed rows.

    outputfile may be safely specified to be the same as the input file,
    in order to overwrite the original file.  When omitted, the output is
    the input name tagged with 'FDR_filtered' (as .xlsx if the input is
    not an Excel file, since the output format must support sheets).

    Returns the output file name.
    """
    from multiplierz.mzReport import reader, writer

    reportReader = reader(reportfile)
    reportRows = list(reportReader)
    columns = reportReader.columns + ['FDR']
    reportReader.close()

    reportRows.sort(key = lambda x: x['Peptide Score'], reverse = True)

    # Accessions are lower-cased before matching, so the marker must be too.
    decoyMarker = decoyString.lower()

    seenSpectra = {}
    passedRows = []
    failedRows = []
    duplicateRows = []
    reverseRows = []
    reverses = 0.0
    forwards = 0.0
    duplicates = 0
    passed = 0
    failed = 0
    lowPass = 999999999
    highRev = 0
    for row in reportRows:
        specDesc = row['Spectrum Description']
        if specDesc in seenSpectra:
            duplicates += 1
            fdr = seenSpectra[specDesc]
            row['FDR'] = fdr

            if includeDuplicates and not separateDuplicateSheet:
                if fdr < threshold:
                    passedRows.append(row)
                else:
                    failedRows.append(row)
            else:
                duplicateRows.append(row)
            continue

        # Requiring ALL accessions to be decoys avoids discarding
        # high-scoring peptides that merely also occur in the reverse
        # database.
        if all([decoyMarker in x.lower()
                for x in row['Accession Number'].split(';')]):
            reverses += 1
            if forwards:
                fdr = reverses / forwards
            else:
                fdr = 100
            row['FDR'] = fdr

            if float(row['Peptide Score']) > highRev:
                highRev = float(row['Peptide Score'])

            seenSpectra[specDesc] = fdr
            reverseRows.append(row)
        else:
            forwards += 1
            fdr = reverses / forwards
            row['FDR'] = fdr
            seenSpectra[specDesc] = fdr
            if fdr < threshold:
                passed += 1
                passedRows.append(row)
                if float(row['Peptide Score']) < lowPass:
                    lowPass = float(row['Peptide Score'])
            else:
                failed += 1
                failedRows.append(row)

    if single_cutoff:
        # Compare as floats, consistent with how lowPass was computed.
        recovered = [x for x in failedRows
                     if float(x['Peptide Score']) > lowPass]
        failedRows = [x for x in failedRows
                      if float(x['Peptide Score']) <= lowPass]
        passedRows += recovered
        # NOTE(review): `passed` (reported on the statistics sheet) does
        # not include these recovered rows - confirm whether it should.

    if not outputfile:
        # Output format must support sheets.
        if reportfile.lower().endswith('xlsx') or reportfile.lower().endswith('xls'):
            outputfile = insert_tag(reportfile, 'FDR_filtered')
        else:
            outputfile = '.'.join(reportfile.split('.')[:-1] + ['FDR_filtered.xlsx'])

    percentage = round(threshold * 100)

    def _write_sheet(sheet_name, sheet_columns, rows):
        # Write `rows` to one sheet of the output file and close it.
        sheet = writer(outputfile, columns = sheet_columns,
                       sheet_name = sheet_name)
        try:
            for row in rows:
                sheet.write(row)
        finally:
            sheet.close()

    if includeFailedSheet:
        _write_sheet("Failed %s%% FDR" % percentage, columns, failedRows)

    if separateDuplicateSheet:
        _write_sheet("Duplicate Rows", columns, duplicateRows)

    if includeReverseSheet:
        _write_sheet('Reverse Hits', columns, reverseRows)

    if includeStatisticsSheet:
        _write_sheet("FDR Statistics",
                     ['FDR Calculation Statistics', '--------------'],
                     [['', ''],
                      ['Total Spectra', str(len(reportRows))],
                      ['Passed %s%% FDR' % percentage, str(passed)],
                      ['Lowest Passing Score', str(lowPass)],
                      ['Reverse Hits', str(reverses)],
                      ['Highest Scoring Reverse Hit', str(highRev)],
                      ['Number of Duplicates', str(duplicates)]])

    _write_sheet("Data", columns, passedRows)

    return outputfile
def evaluateSILAC(outputfile, columns, results, featureIntMap):
    """Evaluate SILAC ('Label') incorporation of a set of PSMs.

    PSMs are grouped by peptide sequence and non-label modifications.  Each
    PSM is written to a 'Data' sheet with added 'Lysines', 'Arginines',
    'Lysine Labels', 'Arginine Labels', 'Fully Labelled' and 'Intensity'
    columns.  Peptides without K or R are written with zero counts and
    'N/A' and are excluded from the statistics.  A PSM counts as fully
    labelled when every K and every R carries a 'Label' modification.  A
    second 'Label Report' sheet lists, per peptide group, the fraction of
    intensity coming from fully labelled PSMs.

    Args:
        outputfile: path of the report to write.
        columns: columns of the input PSMs.
        results: iterable of PSM rows.
        featureIntMap: mapping from scan number (the integer second
            '.'-separated field of 'Spectrum Description') to intensity;
            missing scans count as intensity 0.

    Returns:
        A dict of counters: 'total peptides', 'total lysines',
        'total arginines', 'fully labelled', 'lysine labelled' and
        'arginine labelled'.
    """
    def _split_mods(psm):
        # Individual modification strings, stripped of surrounding
        # whitespace so that the residue letter is the first character;
        # a missing or empty value yields an empty list.
        raw = psm['Variable Modifications']
        if not raw:
            return []
        return [x.strip() for x in raw.split(';') if x.strip()]

    peptides = defaultdict(list)
    for psm in results:
        varmods = [x for x in _split_mods(psm) if 'Label' not in x]
        peptides[psm['Peptide Sequence'], '; '.join(varmods)].append(psm)

    output = writer(outputfile, sheet_name = 'Data',
                    columns = columns + ['Lysines', 'Arginines',
                                         'Lysine Labels', 'Arginine Labels',
                                         'Fully Labelled', 'Intensity'])

    evaluation = {'total peptides':0,
                  'total lysines':0,
                  'total arginines':0,
                  'fully labelled':0,
                  'lysine labelled':0,
                  'arginine labelled':0}
    labelSummary = []
    try:
        for key, pepSet in list(peptides.items()):
            fullLabelledInt = 0
            partLabelledInt = 0

            seq = key[0]
            lysCount = seq.count('K')
            argCount = seq.count('R')

            if not (lysCount + argCount):
                # Nothing that could be labelled: report and skip.
                for psm in pepSet:
                    psm['Lysines'] = 0
                    psm['Arginines'] = 0
                    psm['Lysine Labels'] = 0
                    psm['Arginine Labels'] = 0
                    psm['Fully Labelled'] = 'N/A'
                    psm['Intensity'] = ''
                    output.write(psm)
                continue

            for psm in pepSet:
                assert seq == psm['Peptide Sequence']
                mods = _split_mods(psm)
                lysLabelCount = len([x for x in mods
                                     if x[0] == 'K' and 'Label' in x])
                argLabelCount = len([x for x in mods
                                     if x[0] == 'R' and 'Label' in x])

                psm['Lysines'] = lysCount
                psm['Arginines'] = argCount
                psm['Lysine Labels'] = lysLabelCount
                psm['Arginine Labels'] = argLabelCount

                try:
                    intensity = featureIntMap[
                        int(psm['Spectrum Description'].split('.')[1])]
                except KeyError:
                    intensity = 0

                if lysLabelCount == lysCount and argCount == argLabelCount:
                    fullLabelledInt += intensity
                    psm['Fully Labelled'] = True
                else:
                    partLabelledInt += intensity
                    psm['Fully Labelled'] = False

                psm['Intensity'] = intensity
                output.write(psm)

            # NOTE(review): counters are updated once per peptide group,
            # using the values of the group's last PSM - confirm this is
            # the intended statistic.
            evaluation['total peptides'] += 1
            evaluation['total lysines'] += lysCount
            evaluation['total arginines'] += argCount
            evaluation['fully labelled'] += int(psm['Fully Labelled'])
            evaluation['lysine labelled'] += lysLabelCount
            evaluation['arginine labelled'] += argLabelCount

            labelSummary.append((key, fullLabelledInt, partLabelledInt))
    finally:
        output.close()

    summaryOutput = writer(outputfile, sheet_name = 'Label Report',
                           columns = ['Peptide Sequence', 'Non-Label Mods',
                                      '% Labelling'])
    try:
        for (seq, varmods), fullLabelledInt, partLabelledInt in labelSummary:
            row = {}
            row['Peptide Sequence'] = seq
            row['Non-Label Mods'] = varmods
            total = partLabelledInt + fullLabelledInt
            if total:
                # float() keeps this a true ratio under Python 2 as well.
                row['% Labelling'] = float(fullLabelledInt) / total
            else:
                row['% Labelling'] = ''
            summaryOutput.write(row)
    finally:
        summaryOutput.close()

    return evaluation
def get_reports(self, mascot_ids, dates=None, outputfile=None, ext=None,
                chosen_folder='', **report_kwargs):
    """Retrieve one or more Mascot searches and write them to a report.

    Args:
        mascot_ids: identifiers of the searches to retrieve.
        dates: optional list of dates, one per id.
        outputfile: output name; when omitted it is derived from the ids
            (or, for a single search, from the data file name in the
            retrieved header).
        ext: output extension, with or without a leading '.'; defaults to
            the extension of outputfile, or 'xlsx' when neither is given.
        chosen_folder: folder in which a relative outputfile is placed.
        **report_kwargs: passed through to retrieve_report_data.

    A single search is written with a 'Mascot_Header' sheet and a 'Data'
    sheet.  Several searches are combined into one 'Data' sheet with a
    leading 'File' column, plus one header sheet per search when the
    format is xls, xlsx or mzd.  For 'mzid' output, only a single search
    is supported and the file is fetched with download_mzid.

    Returns:
        The path of the written report.

    Raises:
        AssertionError: on a mismatched date list, an unsupported
            extension derived from outputfile, or several ids with mzid.
        IOError: if a combined report has no file name.
    """
    if ext:
        ext = ext.lstrip('.')

    # Work on a copy: inserting into mzReport.default_columns itself would
    # change the shared list for every other user of it.
    report_columns = list(mzReport.default_columns)
    # NOTE(review): float() assumes a version string with a single '.',
    # such as '2.4' - confirm the format of self.mascot.version.
    if (float(self.mascot.version) >= 2.3
            and 'Protein Database' not in report_columns):
        report_columns.insert(1, 'Protein Database')

    if dates:
        assert len(dates) == len(mascot_ids), "Mismatched date list provided."
        mascot_searches = list(zip(mascot_ids, dates))
    else:
        mascot_searches = [(x, None) for x in mascot_ids]

    reports = []
    for mascot_id, date in mascot_searches:
        header, psms = self.retrieve_report_data(mascot_id, report_columns,
                                                 date, **report_kwargs)
        # Fall back to the search id when the header names no data file.
        datafilename = header[7][1] or str(mascot_id)
        reports.append((mascot_id, datafilename, header, psms))

    imputed_output_file_name = False
    if outputfile and ext:
        if not outputfile.lower().endswith(ext):
            outputfile += '.' + ext
    elif not outputfile:
        if not ext:
            ext = 'xlsx'
        outputfile = '_'.join(str(x) for x in mascot_ids) + '.' + ext.strip('.')
        imputed_output_file_name = True
    elif outputfile and not ext:
        ext = outputfile.split('.')[-1]
        # NOTE(review): validation is applied to the extension derived
        # from the file name only - confirm this matches the intent.
        assert ext in ['csv', 'xlsx', 'xls', 'mzd', 'mzid']

    assert outputfile
    if chosen_folder and not os.path.isabs(outputfile):
        outputfile = os.path.join(chosen_folder, outputfile)

    if ext and 'mzid' in ext:
        assert len(mascot_ids) == 1, ("Combined result file not supported "
                                      "for mzIdentML files.")
        # Take the id and date explicitly instead of relying on the
        # variables left over from the retrieval loop.
        mascot_id, date = mascot_searches[0]
        self.mascot.download_mzid(mascot_id, save_file=outputfile, date=date)
    elif len(mascot_ids) == 1:
        mascot_id, datafilename, header, psms = reports[0]
        if imputed_output_file_name or not outputfile:
            outputfile = datafilename + '.' + ext
            if chosen_folder:
                outputfile = os.path.join(chosen_folder, outputfile)

        output = mzReport.writer(outputfile, columns=header[0],
                                 sheet_name='Mascot_Header')
        for line in header[1:]:
            output.write(line)
        output.close()

        output = mzReport.writer(outputfile, columns=report_columns,
                                 sheet_name='Data')
        for psm in psms:
            output.write(psm)
        output.close()
    else:
        if not outputfile:
            raise IOError("Combined report file name must be specified.")

        if outputfile.lower().endswith(('xls', 'xlsx', 'mzd')):
            for m_id, datafilepath, header, _ in reports:
                datafilename = os.path.basename(datafilepath)
                output = mzReport.writer(
                    outputfile, columns=header[0],
                    sheet_name='%s Mascot Header' % datafilename)
                for line in header[1:]:
                    output.write(line)
                output.close()
        else:
            extension = outputfile.split('.')[-1]
            print("Omitting header tables due to %s format." % extension)

        output = mzReport.writer(outputfile,
                                 columns=['File'] + report_columns,
                                 sheet_name='Data')
        for _, datafilepath, _, psms in reports:
            datafilename = os.path.basename(datafilepath)
            for psm in psms:
                psm['File'] = datafilename
                output.write(psm)
        output.close()

    return outputfile
def format_report(reportfile, outputfile=None, mgffile=None, parameters=None,
                  most_rank=None, most_exp=None):
    """
    Renders a native Comet output .txt file into an mzReport-compatible and
    prettier .xlsx format.

    (Native .txt output is noncompatible mostly due to a space instead of
    underscore in the 'modified peptide' column; hopefully that will be fixed
    soon.)

    reportfile -- path of the tab-delimited Comet report to convert.
    outputfile -- destination path; defaults to reportfile with its last
                  extension replaced by 'xlsx'.
    mgffile    -- optional MGF file; its spectrum titles, numbered from 1 in
                  file order, become the 'Spectrum Description' of the row
                  whose 'Query' value matches.
    parameters -- optional dict of search settings, appended (sorted by
                  name) to the header sheet.
    most_rank  -- if given, rows with 'Peptide Rank' above it are dropped.
    most_exp   -- if given, rows with 'Expectation Value' above it are
                  dropped.

    Returns the path of the file written.
    """
    if most_rank:
        most_rank = int(most_rank)
    if most_exp:
        most_exp = float(most_exp)

    if mgffile:
        from multiplierz.mgf import parse_to_generator
        mgfgen = parse_to_generator(mgffile)
        # 'start' must go to enumerate(); passing it to dict() produced a
        # 0-based mapping plus a bogus {'start': 1} entry.
        queryToDesc = dict(enumerate((x['title'] for x in mgfgen), start=1))
    else:
        queryToDesc = {}

    def tryNum(thing):
        # Convert to int if possible, else float, else leave the string.
        try:
            return int(thing)
        except ValueError:
            try:
                return float(thing)
            except ValueError:
                return thing

    rows = []
    with open(reportfile, 'r') as report:
        headeritems = next(report).split('\t')
        header = {
            'Program': headeritems[0],
            'Data': headeritems[1],
            'Search Run Time': headeritems[2],
            'Database': headeritems[3].strip()
        }

        columnline = next(report)
        # Fix for presumed bug; omit if this column title is changed in
        # later Comet versions.
        columnline = columnline.replace('peptide\tmodifications',
                                        'peptide_modifications')

        columns = [toStandardPSMConversions.get(x, x)
                   for x in columnline.strip().split('\t')]

        for line in report:
            values = [tryNum(x.strip()) for x in line.split('\t')]
            row = dict(zip(columns, values))
            row = convertVarmods(row)
            row['Spectrum Description'] = queryToDesc.get(row['Query'],
                                                          'Unknown')
            rows.append(row)

    if not outputfile:
        outputfile = '.'.join(reportfile.split('.')[:-1] + ['xlsx'])

    # Only Excel outputs get the separate header sheet.
    if outputfile.lower().endswith(('xlsx', 'xls')):
        headerwriter = writer(
            outputfile,
            columns=['Program', 'Data', 'Search Run Time', 'Database'],
            sheet_name='Comet_Header')
        headerwriter.write(header)
        headerwriter.write(['', '', '', ''])
        if parameters:
            for setting, value in sorted(parameters.items()):
                headerwriter.write({
                    'Program': setting,
                    'Data': value,
                    'Search Run Time': '',
                    'Database': ''
                })
        headerwriter.close()

    mainwriter = writer(outputfile,
                        columns=['Spectrum Description'] + columns,
                        sheet_name='Data')
    for row in rows:
        if most_rank and row['Peptide Rank'] > most_rank:
            continue
        if most_exp and row['Expectation Value'] > most_exp:
            continue
        mainwriter.write(row)
    mainwriter.close()

    return outputfile
def feature_analysis(datafile, resultFiles, featureFile=None, tolerance=None,
                     mzRegex=None, scanRegex=None, **constants):
    """
    Runs feature detection on a .RAW file and annotates PSM reports with it.

    Unless an existing featureFile is supplied, one is produced by
    detect_features(datafile, tolerance=..., **constants).  Each report in
    resultFiles is then regrouped by feature and written next to the input
    as '<name>.featureDetect.xlsx', with feature extent and intensity
    columns appended.

    mzRegex / scanRegex, when given, replace the module-level
    spectrumDescriptionToMZ / spectrumDescriptionToScanNumber parsers with
    ones that convert the first regex match in a spectrum description.

    Returns (featureFile, list of output file paths).
    """
    import os

    global spectrumDescriptionToMZ
    global spectrumDescriptionToScanNumber

    if mzRegex:
        import re
        mz_pattern = re.compile(mzRegex)

        def newParser(description):
            return float(mz_pattern.search(description).group())

        spectrumDescriptionToMZ = newParser

    if scanRegex:
        import re
        scan_pattern = re.compile(scanRegex)

        def newParser(description):
            return int(scan_pattern.search(description).group())

        spectrumDescriptionToScanNumber = newParser

    # Validate every input path before doing any expensive work.
    assert os.path.exists(datafile), "%s not found!" % datafile
    for resultfile in resultFiles:
        assert os.path.exists(resultfile), "%s not found!" % resultfile
    assert datafile.lower().endswith(
        '.raw'), "Only .raw files are currently supported."

    if not featureFile:
        featureFile = detect_features(datafile, tolerance=tolerance,
                                      **constants)
    else:
        assert os.path.exists(
            featureFile
        ), "Specified feature data file %s not found!" % featureFile

    features = FeatureInterface(featureFile)
    outputfiles = []

    if not resultFiles:
        print("No PSM data given; skipping annotation step.")
        return featureFile, outputfiles

    feature_columns = [
        'Feature', 'feature error', 'feature start scan',
        'feature end scan', 'feature start time', 'feature end time',
        'feature intensity', 'feature kurtosis', 'feature skewness'
    ]

    print(resultFiles)
    print("Categorizing search results by file.")
    for resultfile in resultFiles:
        resultfile = os.path.abspath(resultfile)
        inputResults = mzReport.reader(resultfile)

        name_parts = resultfile.split('.')[:-1]
        outputfile = '.'.join(name_parts + ['featureDetect', 'xlsx'])
        outputfiles.append(outputfile)

        resultsByFeature = binByFullFeature(datafile, features, inputResults)

        output = mzReport.writer(
            outputfile, columns=inputResults.columns + feature_columns)
        for result in resultsByFeature:
            output.write(result)
        output.close()

        print("Output saved to %s ." % outputfile)

    return featureFile, outputfiles
def on_convert(self, event):
    """Convert (or merge) the selected files into the chosen output format.

    Reads the file list and the input/output format selections from the
    dialog controls, then dispatches on the input format index:
    0 Mascot CSV, 1 Mascot DAT, 2 Mascot mzIdentML, 3 Protein Pilot,
    4 OMSSA CSV, 5 X!Tandem XML, 6 other mzReport file.  When the combine
    checkbox is set, the files are merged through combineFiles() instead
    (only allowed for input formats 0 and 6).

    The busy cursor is shown for the duration of the work and is always
    released, including when a conversion raises.
    """
    files = self.file_list.GetStrings()
    if not files:
        wx.MessageBox('No files selected', 'Error')
        return

    input_format = self.input_format.GetSelection()
    output_format = self.output_format.GetSelection()
    output_ext = {0: '.xls', 1: '.xlsx', 2: '.csv', 3: '.mzd'}[output_format]

    # Reject an impossible merge before the busy cursor is started, so the
    # early return cannot leave the hourglass stuck on screen.
    combining = self.combineCheck.GetValue()
    if combining and input_format not in [0, 6]:
        wx.MessageBox("Only tabular/Excel files can currently be merged.")
        return

    # show hourglass
    wx.BeginBusyCursor(wx.HOURGLASS_CURSOR)
    try:
        # update statusbar
        self.set_status("Converting...", 0)
        self.set_status("", 1)

        if combining:
            combineFiles(files, self.combineCtrl.GetValue(), output_ext)

        elif input_format == 0:  # Mascot CSV
            mascot_converter = mascot.mascot(version=settings.mascot_version)
            for file_name in files:
                self.set_status(file_name, 1)

                # Clean the CSV into a temporary '<name>_clean<ext>' file,
                # copy it row by row into the report, then delete it.
                clean_csv_file = '_clean'.join(os.path.splitext(file_name))
                rep_file = os.path.splitext(clean_csv_file)[0] + output_ext
                if os.path.exists(rep_file):
                    os.remove(rep_file)

                mascot_converter.clean_csv(file_name,
                                           export_file=clean_csv_file,
                                           ion_list=False)
                repreader = mzReport.reader(clean_csv_file)
                repwriter = mzReport.writer(rep_file,
                                            columns=repreader.columns)
                for row in repreader:
                    repwriter.write(row)
                repreader.close()
                repwriter.close()

                os.remove(clean_csv_file)

        elif input_format == 1:  # Mascot DAT
            mascot_reporter = mzTools.MascotReport()
            _mascot_options = dict(max_hits=1000,
                                   ion_cutoff=20,
                                   bold_red=True,
                                   unassigned_queries=False,
                                   show_query_data=True,
                                   show_same_set=False,
                                   show_sub_set=False,
                                   quant=False)
            for file_name in files:
                self.set_status(file_name, 1)

                mascot_dat_file = mascot.MascotDatFile(file_name,
                                                       **_mascot_options)
                mascot_header = mascot_dat_file.mascot_header()

                # Name the report after the searched data file recorded in
                # the header, or after the DAT file when that is empty.
                ms_file_name = mascot_header[7][1] or (
                    os.path.splitext(os.path.basename(file_name))[0])
                report_file = os.path.join(
                    os.path.dirname(file_name),
                    os.path.basename(ms_file_name) + output_ext)
                if os.path.exists(report_file):
                    os.remove(report_file)

                if output_ext in ('.xls', '.xlsx', '.mzd'):
                    mascot_reporter.mascot_header(report_file, mascot_header)

                if mascot_dat_file.res_file.getMascotVer() >= '2.3':
                    report = mzReport.writer(
                        report_file,
                        columns=(mzReport.default_columns[:1]
                                 + ['Protein Database']
                                 + mzReport.default_columns[1:]))
                else:
                    report = mzReport.writer(report_file,
                                             default_columns=True)

                for row in mascot_dat_file.peptide_report():
                    report.write(row)

                mascot_dat_file.close()
                report.close()

        elif input_format == 2:  # Mascot mzIdentML
            for file_name in files:
                mzid = mzIdentML(file_name)
                data = mzid.peptideSummary()
                header = data[0].keys()

                report_file = os.path.splitext(file_name)[0] + output_ext
                if os.path.exists(report_file):
                    os.remove(report_file)

                report = mzReport.writer(report_file, columns=header)
                for row in data:
                    writeRow = []
                    for column in header:
                        thing = row[column]
                        # List-valued cells are flattened to one string.
                        if isinstance(thing, list):
                            thing = "; ".join(thing)
                        writeRow.append(thing)
                    report.write(writeRow)
                report.close()

        elif input_format == 3:  # Protein Pilot
            for file_name in files:
                self.set_status(file_name, 1)
                pilot = ProteinPilot(file_name)
                pilot.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 4:  # OMSSA
            for file_name in files:
                self.set_status(file_name, 1)
                omssa = OMSSA_CSV(file_name)
                omssa.format(str(os.path.splitext(file_name)[0] + output_ext))

        elif input_format == 5:  # X!Tandem XML
            for file_name in files:
                report_file = os.path.splitext(file_name)[0] + output_ext
                format_XML(file_name, report_file)

        elif input_format == 6:  # other mzReport
            for file_name in files:
                self.set_status(file_name, 1)
                rdr = reader(file_name)
                outputname = '.'.join(file_name.split('.')[:-1]) + output_ext
                wtr = writer(outputname, columns=rdr.columns)
                for row in rdr:
                    wtr.write(row)
                wtr.close()
                rdr.close()
    finally:
        # hide hourglass, even if a conversion failed part-way
        wx.EndBusyCursor()

    self.set_status("Ready", 0)
    self.set_status("Done", 1)
def add_gene_ids(target_files, p2g_database, target_sheet=None, outputfile=None,
                 inPlace=False, leucine_equals_isoleucine=True,
                 legacy_columns=True):
    """Annotate PSM report files with matching proteins and gene symbols.

    For every row, the peptide (column 'Peptide Sequence', else 'Peptide')
    is reduced to its letters, candidate proteins are found by intersecting
    the database's k-mer lookups, and the candidates are confirmed with a
    regular-expression search of the protein sequence.  Protein and gene
    counts/lists are added as new columns and the rows written to a new
    '<name>.GENES.<ext>' file (or to ``outputfile`` when a single file name
    was passed in).

    Args:
        target_files: one report path (str) or a list of report paths.
        p2g_database: path of the pickled Pep2Gene database.
        target_sheet: sheet name handed to the reader, when it accepts one.
        outputfile: output path; only honoured when ``target_files`` is a
            single string.
        inPlace: accepted for interface compatibility; not used here.
        leucine_equals_isoleucine: also report matches found with every
            'I' in the peptide replaced by 'L'.
        legacy_columns: use the old column titles ('pro_count', ...)
            instead of the descriptive ones ('Protein Count', ...).

    Returns:
        A list of output paths when a list was given, otherwise the single
        output path.

    Raises:
        AssertionError: the database lacks I/L data although it was
            requested, or was built with a different k-mer length than K.
        Exception: the database is a tuple of unexpected length.
    """
    from functools import reduce

    # time.clock() no longer exists on Python 3.8+; wall-clock time is all
    # that is needed for these progress messages.
    starttime = time.time()

    if isinstance(target_files, str):
        return_list = False
        target_files = [target_files]
    else:
        return_list = True

    # NOTE(security): pickle.load can execute arbitrary code; only open P2G
    # databases that come from a trusted source.
    with open(p2g_database, 'rb') as dataRdr:
        data = pickle.load(dataRdr)
        k_len = None
        if isinstance(data, tuple) and len(data) == 6:
            (k_len, seqLookup, fmerLookup, geneLookup,
             isoSeqLookup, isoFmerLookup) = data
        elif isinstance(data, tuple):
            raise Exception(
                "Unrecognized P2G database layout: tuple of length %s"
                % len(data))
        else:
            # Legacy files store each lookup as its own pickle, in sequence.
            print('Legacy mode P2G database detected!')
            seqLookup = data
            fmerLookup = pickle.load(dataRdr)
            geneLookup = pickle.load(dataRdr)
            try:
                isoSeqLookup = pickle.load(dataRdr)
                isoFmerLookup = pickle.load(dataRdr)
            except EOFError:
                # Older legacy files carry no I/L ambiguity tables.
                isoSeqLookup = None
                isoFmerLookup = None

    firstGene = next(iter(geneLookup.values()), None)
    if isinstance(firstGene, tuple):
        print("Legacy mode gene names detected.")
        # Legacy values are tuples; pick the element that looks like a name
        # (contains a letter), judging by the first entry.
        nameIndex = 0 if firstGene[0] and any(
            x.isalpha() for x in firstGene[0]) else 1
        for k, v in list(geneLookup.items()):
            geneLookup[k] = v[nameIndex]

    if leucine_equals_isoleucine:
        assert isoFmerLookup, (
            "Pep2Gene database does not contain leucine-isoleucine "
            "ambiguity data; re-compile database or "
            "select leucine_equals_isoleucine = False .")

    if k_len:
        assert k_len == K, "Pep2Gene database created with kmers of length %s, not %s" % (
            k_len, K)

    print("P2G database loaded: %.2f\n\n" % (time.time() - starttime))
    prevtime = time.time()

    # Column titles are the same for every file, so build them once.  The
    # 'colname' dict maps the descriptive title to the title actually written.
    add_cols = ["Protein Count", "Proteins", "Gene Count", "Gene Symbols"]
    add_legacy_cols = ["pro_count", "pro_list", "gene_count", "gene_symbols"]
    iso_cols = [
        'I<->L Protein Count', 'I<->L Proteins', 'I<->L Gene Count',
        'I<->L Gene Symbols'
    ]
    iso_legacy_cols = [
        'IL Ambiguity pro_count', 'IL Ambiguity pro_list',
        "IL Ambiguity gene_count", "IL Ambiguity gene_symbols"
    ]
    descriptive = list(add_cols)
    legacy = list(add_legacy_cols)
    if leucine_equals_isoleucine:
        descriptive += iso_cols
        legacy += iso_legacy_cols
    new_cols = legacy if legacy_columns else descriptive
    colname = dict(zip(descriptive, new_cols))

    def find_proteins(peptide, kmerLookup, sequenceLookup):
        # Proteins sharing the peptide's k-mers, confirmed by a regex search
        # of the stored protein sequence.
        candidates = reduce(set.intersection,
                            (kmerLookup[peptide[x:x + K]]
                             for x in range(len(peptide) - K)))
        # pep_find could be replaced by giving the p2g database a pre-made
        # set of hashes of all tryptic peptides in a protein, and seeing if
        # the hash of the pep is present in the set.
        pep_find = re.compile(
            '((^M?)|[KR](?=[^P]))%s(((?<=[KR])[^P])|$)' % peptide)
        return set(prot for prot in candidates
                   if pep_find.search(sequenceLookup[prot]))

    def summarize(prots):
        # (protein count, protein list, gene count, gene list) for a match set.
        genes = set(geneLookup[x] for x in prots if x in geneLookup)
        return len(prots), '; '.join(prots), len(genes), '; '.join(genes)

    outputfiles = []
    for target_file in target_files:
        try:
            rdr = reader(target_file, sheet_name=target_sheet)
        except TypeError:
            rdr = reader(target_file)  # Not an Excel file.

        if (not outputfile) or return_list:
            ext = target_file.split('.')[-1]
            outputfile = '.'.join(target_file.split('.')[:-1] + ['GENES', ext])
        output = writer(outputfile, columns=rdr.columns + new_cols)

        # Per-file caches so each distinct peptide is only searched once.
        pepToProts = {}
        isoPepToProts = {}
        for counter, row in enumerate(rdr):
            if counter % 1000 == 0:
                print_progress(counter)

            try:
                pep = row['Peptide Sequence'].upper()
            except KeyError:
                pep = row['Peptide'].upper()

            pep = ''.join([x for x in pep if x.isalpha()])
            # NOTE(review): rows whose peptide is K letters or shorter are
            # left out of the output entirely, as before -- confirm intended.
            if len(pep) <= K:
                continue  # No 4-mers in a 3-mer!

            isoPep = pep.replace('I', 'L')

            if pep not in pepToProts:
                pepToProts[pep] = find_proteins(pep, fmerLookup, seqLookup)
            if leucine_equals_isoleucine and isoPep not in isoPepToProts:
                isoPepToProts[isoPep] = find_proteins(isoPep, isoFmerLookup,
                                                      isoSeqLookup)

            proteinCount, proteins, geneCount, geneIds = summarize(
                pepToProts[pep])
            row[colname['Protein Count']] = proteinCount
            row[colname['Proteins']] = proteins
            row[colname['Gene Count']] = geneCount
            row[colname['Gene Symbols']] = geneIds

            if leucine_equals_isoleucine:
                (isoProteinCount, isoProteins,
                 isoGeneCount, isoGeneIds) = summarize(isoPepToProts[isoPep])
                row[colname['I<->L Protein Count']] = isoProteinCount
                row[colname['I<->L Proteins']] = isoProteins
                row[colname['I<->L Gene Count']] = isoGeneCount
                row[colname['I<->L Gene Symbols']] = isoGeneIds

            output.write(row)

        print("\nGene lookup completed: %.2f" % (time.time() - prevtime))
        prevtime = time.time()

        rdr.close()
        output.close()

        print("Output written: %.2f" % (time.time() - prevtime))
        outputfiles.append(outputfile)

    if return_list:
        return outputfiles
    else:
        return outputfile
def legacy_get_reports(self, mascot_ids, dates=None, chosen_folder=None,
                       combined_file=False, rank_one=False,
                       protein_report=False, mascot_options=None, peaks=False,
                       peaks_options=None, mascot_web=False,
                       mascot_web_options=None, mascot_prot_cov=False,
                       ext='.xlsx', local_dat_files=None,
                       mascotIDInResultName=False, percolatorDirectory=None,
                       **kwargs):
    """Generate Multiplierz-Mascot reports from Mascot DAT files.

    Each search is downloaded (or read from ``local_dat_files``), parsed,
    and written either to its own report file or into one combined report.

    Args:
        mascot_ids: list/tuple of search IDs; an entry may be 'ID:date'.
        dates: matching list/tuple of dates, or a false value.
        chosen_folder: output directory; defaults to ``myData``.
        combined_file: false for individual files, or the name of the
            single output file to create.
        rank_one: keep only rows whose 'Peptide Rank' is 1.
        protein_report: also add a protein page through self.prot_report.
        mascot_options: overrides for max_hits, ion_cutoff, bold_red,
            unassigned_queries, show_query_data, show_same_set,
            show_sub_set, quant.
        peaks: must be False; peak extraction is no longer supported.
        peaks_options: only read when ``peaks`` is set.
        mascot_web: fetch per-PSM data through self.mascot_web.
        mascot_web_options: overrides for ms2_img, mascot_ms2,
            mascot_var_mods, instrument, im_size.
        mascot_prot_cov: add a 'Protein Coverage' column through
            self.mascot_prot_coverage.
        ext: report extension including the dot; '.mzid' downloads the
            mzIdentML file instead of building a report.
        local_dat_files: local DAT files to use instead of downloading;
            this disables mascot_web and mascot_prot_cov.
        mascotIDInResultName: for '.mgf'-named data files, replace that
            suffix with the search ID in the report name.
        percolatorDirectory: when set and the search has decoy hits, run
            Mascot Percolator on the DAT file.
        **kwargs: individual option overrides.  Priority is
            keyword > option dict > default.

    Returns:
        ``[report_file]`` for a combined report, otherwise the list of
        report files written.
    """
    # Using local .DATs means you don't have access to certain fancy
    # Mascot features.
    if local_dat_files:
        mascot_web = False
        mascot_prot_cov = False

    assert not peaks, (
        "precursor_peaks and images in result files are no longer supported; "
        "peaks argument to get_reports must be False.")

    # defaults
    _mascot_options = dict(max_hits=1000,
                           ion_cutoff=20,
                           bold_red=True,
                           unassigned_queries=False,
                           show_query_data=True,
                           show_same_set=False,
                           show_sub_set=False,
                           quant=False)
    # option_dict
    if mascot_options:
        _mascot_options.update(mascot_options)
    # keywords
    _mascot_options.update(
        (k, kwargs[k]) for k in kwargs if k in _mascot_options)

    if peaks:
        # defaults
        _peaks_options = dict(time_window=(0.5, 0.5),
                              mz_window=(0.1, 0.1),
                              plot_ms1=False,
                              plot_xic=False,
                              plot_ms2=False,
                              peak_area=False,
                              reporter_ions=False,
                              peakfilter=None,
                              ion_list=['b', 'y'],
                              instrument='ESI-TRAP',
                              im_size=(8.0, 6.0))
        # option_dict
        if peaks_options:
            _peaks_options.update(peaks_options)
        # keywords
        _peaks_options.update(
            (k, kwargs[k]) for k in kwargs
            if (k in _peaks_options or k == 'peak_data_path'))

        # need a path (file or directory) to actually do this,
        # so we raise an exception if it's not present
        if 'peak_data_path' not in _peaks_options:
            raise ValueError(
                'peak_data_path value is required for peak extraction')

    if mascot_web:
        # defaults
        _mascot_web_options = dict(ms2_img=True,
                                   mascot_ms2=True,
                                   mascot_var_mods=True,
                                   instrument='ESI-TRAP',
                                   im_size=(8.0, 6.0))
        # option_dict
        if mascot_web_options:
            _mascot_web_options.update(mascot_web_options)
        # keywords
        _mascot_web_options.update(
            (k, kwargs[k]) for k in kwargs if k in _mascot_web_options)

    # if version is 2.2+, mod positions are extracted automatically
    if mascot_web and self.mascot.version >= '2.2':
        _mascot_web_options['mascot_var_mods'] = False
        if not _mascot_web_options['ms2_img']:
            mascot_web = False

    # require agreement 'instrument' between two dictionaries
    if peaks and mascot_web:
        if _peaks_options['instrument'] != _mascot_web_options['instrument']:
            raise ValueError(
                'instrument value must be consistent; input dictionaries disagree'
            )

    # Getting both of these would be redundant, so force at most one
    if mascot_web and peaks and _mascot_web_options['ms2_img']:
        _peaks_options['plot_ms2'] = False

    if chosen_folder is None:
        chosen_folder = myData

    def build_columns():
        # Returns (report columns, columns coming out of the DAT file).
        # Reads self.mascot.version at call time, because the version may
        # be corrected once a DAT file has been opened.
        cols = mzReport.default_columns[:]
        # mascot 2.3 can have multiple databases so add a column for that
        if self.mascot.version >= '2.3':
            cols.insert(1, 'Protein Database')
        # these are the columns coming out of the dat file, need them separate
        dat_cols = cols[:]
        # add columns for peak extraction
        if peaks:
            cols.extend(c for c in [
                'MS2 Time', 'Peak Time', 'Peak Intensity',
                'Peak Width (sec)', 'Peak Comment'
            ] if c not in cols)
            if _peaks_options['peak_area'] and 'Peak Area' not in cols:
                cols.append('Peak Area')
            if _peaks_options['reporter_ions']:
                cols.extend(
                    c for c in ['Rep114', 'Rep115', 'Rep116', 'Rep117']
                    if c not in cols)
        return cols, dat_cols

    # if creating a single file, we'll create the writer now
    if combined_file:
        repcols, res_cols = build_columns()
        repcols.insert(0, 'File')

        report_file = os.path.join(chosen_folder, combined_file)
        if os.path.exists(report_file):
            os.remove(report_file)

        report = mzReport.writer(report_file, columns=repcols)
        isMZD = isinstance(report, mzReport.mzDB.SQLiteWriter)

        mascot_headers = []
    else:
        report_files = []

    if dates:
        mid_d = zip(mascot_ids, dates, [None] * len(mascot_ids))
    elif local_dat_files:
        mid_d = zip(["Local File"] * len(local_dat_files),
                    [None] * len(local_dat_files), local_dat_files)
    else:
        mid_d = [(mid, None, None) for mid in mascot_ids]

    for mascot_id, date, local in mid_d:
        mascot_id = str(mascot_id)
        if ':' in mascot_id:
            (mascot_id, date) = mascot_id.split(':', 1)
        mascot_id = str(mascot_id).zfill(6)
        if not (date or local):
            date = self.mascot.get_date(mascot_id)

        logger_message(
            30,
            'Generating Multiplierz-Mascot Report for JobID %s...' % mascot_id)

        if ext == '.mzid':
            # mzIdentML files don't use the rest of this function;
            # what they contain is essentially fixed, to multiplierz.
            # NOTE(review): report_files only exists when combined_file is
            # false, so '.mzid' together with a combined file fails here.
            logger_message(30, 'Downloading mzIdentML File...')
            destination = chosen_folder if chosen_folder else myData
            reportfilename = "F%s.mzid" % mascot_id
            outputfile = os.path.join(destination, reportfilename)
            report_file = self.mascot.download_mzid(mascot_id,
                                                    save_file=outputfile,
                                                    date=date)
            assert report_file == outputfile
            report_files.append(report_file)
            continue

        if not local:
            logger_message(30, 'Downloading Mascot DAT File...')
            dat_file = self.mascot.download_dat(chosen_folder, mascot_id,
                                                date)
        else:
            dat_file = os.path.abspath(local)
            mascot_id = os.path.basename(local).split('.')[0]

        if dat_file:
            logger_message(20, 'Mascot DAT File Downloaded!')
            mascot_dat_file = interface.MascotDatFile(dat_file,
                                                      **_mascot_options)
            if percolatorDirectory and mascot_dat_file.hasDecoyHits():
                print("Running Mascot Percolator...")
                mascot_dat_file.close()
                percolatedDatFile = runPercolator(dat_file,
                                                  percolatorDirectory)
                # Reopen with the merged options; the raw 'mascot_options'
                # argument defaults to None and cannot be **-expanded.
                # NOTE(review): this reopens the original DAT file, not
                # percolatedDatFile -- confirm that is intended.
                mascot_dat_file = interface.MascotDatFile(dat_file,
                                                          **_mascot_options)
                if self.cleanup:
                    os.remove(dat_file)
                dat_file = percolatedDatFile
        else:
            logger_message(40,
                           'Failed to download DAT file for %s' % mascot_id)
            continue

        # Trust the version recorded in the DAT file over the configured one.
        dat_version = mascot_dat_file.res_file.getMascotVer()[
            :len(self.mascot.version)]
        if self.mascot.version != dat_version:
            print(
                "Mascot version mismatch detected; changing version from %s to %s"
                % (self.mascot.version, dat_version))
            self.mascot.version = dat_version

        if not combined_file:
            # Report column stuff moved from above, in order to handle
            # version dependency.
            repcols, res_cols = build_columns()
            if mascot_prot_cov:
                repcols.append('Protein Coverage')

        # Get MS File Name
        mascot_header = mascot_dat_file.mascot_header()
        ms_file_name = mascot_header[7][1] or ('F%s' % mascot_id)

        if not combined_file:
            filename = os.path.basename(ms_file_name)
            if mascotIDInResultName and filename.endswith('.mgf'):
                filename = filename[:-4] + "." + mascot_id

            report_file = os.path.join(chosen_folder, filename + ext)
            if os.path.exists(report_file):
                os.remove(report_file)

            report = mzReport.writer(report_file, columns=repcols)
            isMZD = isinstance(report, mzReport.mzDB.SQLiteWriter)

        use_mascot_web = mascot_web and (
            _mascot_web_options['ms2_img']
            or _mascot_web_options['mascot_var_mods'])

        if use_mascot_web:
            gen_options = {}
            for option in ('ms2_img', 'mascot_var_mods'):
                if option in _mascot_web_options:
                    gen_options[option] = _mascot_web_options[option]

            mascot_web_gen = self.mascot_web(
                mascot_id,
                date=date,
                dat_file=(dat_file
                          if _mascot_web_options['mascot_ms2'] else None),
                isMZD=isMZD,
                **gen_options)
            # Prime the generator so it can accept send() calls.
            next(mascot_web_gen)

        if mascot_prot_cov:
            prot_cov_gen = self.mascot_prot_coverage(
                mascot_id, _mascot_options['ion_cutoff'], date)
            next(prot_cov_gen)

        prot_desc_dict = {}

        if self.mascot.version != mascot_dat_file.res_file.getMascotVer(
        )[:len(self.mascot.version)]:
            raise TypeError("Incorrect version of Mascot selected. %s %s" % (
                self.mascot.version,
                mascot_dat_file.res_file.getMascotVer()
                [:len(self.mascot.version)]))

        missing_desc_count = 0
        for row in mascot_dat_file.peptide_report():
            row = mzReport.ReportEntry(res_cols, row)

            if rank_one and row['Peptide Rank'] != 1:
                continue

            if (not local) and not (row['Protein Description']
                                    or row['Protein Mass']):
                if row['Accession Number'] not in prot_desc_dict:
                    # Looking the description up on the server is very
                    # slow, so missing entries are only counted.
                    missing_desc_count += 1
                row['Protein Description'], row[
                    'Protein Mass'] = prot_desc_dict.get(
                        row['Accession Number'], ('-', '-'))

            md = []

            if use_mascot_web:
                (vartext, img_tup) = mascot_web_gen.send(row)
                if _mascot_web_options['mascot_var_mods']:
                    row['Variable Modifications'] = vartext
                if _mascot_web_options['ms2_img']:
                    md.append(img_tup)

            if mascot_prot_cov:
                (prot_cov, md_tup) = prot_cov_gen.send(row)
                row['Protein Coverage'] = prot_cov
                md.append(md_tup)

            if combined_file:
                row['File'] = ms_file_name

            report.write(row, metadata=md)

        if missing_desc_count:
            print("Missing protein info for %d PSMs." % missing_desc_count)

        if peaks:
            peak_gen.close()

        if use_mascot_web:
            mascot_web_gen.close()

        if mascot_prot_cov:
            prot_cov_gen.close()

        # Mascot-decoy-data finder!
        if mascot_dat_file.hasDecoyHits():
            decoy_dat_file = interface.MascotDatFile(dat_file,
                                                     decoyMode=True,
                                                     **_mascot_options)
            for row in decoy_dat_file.peptide_report():
                report.write(row)
            decoy_dat_file.close()

        if not combined_file:
            if os.path.splitext(report_file)[1].lower() in ('.xls', '.xlsx',
                                                            '.mzd'):
                report.close()
                self.mascot_headers(report_file, [(None, mascot_header)])
                if protein_report:
                    self.prot_report(report_file,
                                     mascot_dat_file.protein_report())
            else:
                report.close()

            report_files.append(report_file)
        else:
            if os.path.splitext(report_file)[1].lower() in ('.xls', '.xlsx',
                                                            '.mzd'):
                mascot_headers.append((ms_file_name, mascot_header))

        mascot_dat_file.close()

        if self.cleanup and not local_dat_files:
            os.remove(dat_file)

        logger_message(
            30,
            'Multiplierz-Mascot Report for JobID %s Generated!' % mascot_id)

    if combined_file:
        if os.path.splitext(report_file)[1].lower() in ('.xls', '.xlsx',
                                                        '.mzd'):
            report.close()
            self.mascot_headers(report_file, mascot_headers)

            # not supported right now: protein reports for XLS.
            if isMZD and protein_report:
                self.prot_report(report_file, None)
        else:
            report.close()

    return [report_file] if combined_file else report_files