# Imports assumed from the surrounding csodiaq package. The third-party imports
# (pandas, numpy, pyteomics) are standard; the csodiaq-internal module paths
# below are assumptions and may differ in the actual repository.
import re
from collections import defaultdict
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from pyteomics import mgf, mzxml

import spectra_matcher_functions as smf  # assumed: shared print/format helpers
import idpicker as idp                   # assumed: IDPicker protein inference
import IdentificationSpectraMatcher
import QuantificationSpectraMatcher


def write_fdr_outputs(inFile, specFile, pepFile, protFile):
    smf.print_milestone('Generating FDR Analysis Files:')
    overallDf = pd.read_csv(inFile).sort_values(
        'MaCC_Score', ascending=False).reset_index(drop=True)

    # Spectral- and peptide-level FDR filtering of the full csodiaq output.
    spectralDf = add_fdr_to_csodiaq_output(overallDf)
    peptideDf = add_fdr_to_csodiaq_output(overallDf, filterType='peptide')
    spectralDf.to_csv(specFile, index=False)
    peptideDf.to_csv(pepFile, index=False)

    # Protein-level FDR is optional; it requires IDPicker-style protein inference.
    if protFile:
        peptideProteinConnections = format_peptide_protein_connections(peptideDf)
        verifiedProteinDict = idp.find_valid_proteins(peptideProteinConnections)
        proteinDf = add_leading_protein_column(peptideDf, verifiedProteinDict)
        tempProtDf = add_fdr_to_csodiaq_output(
            proteinDf, filterType='leadingProtein')
        proteinMetaInfoDict = tempProtDf.set_index('leadingProtein').T.to_dict()
        proteinDf = remove_invalid_peptides_and_add_metadata(
            proteinDf, proteinMetaInfoDict)
        proteinDf = mark_peptides_unique_to_proteins(proteinDf)
        proteinDf.to_csv(protFile, index=False)

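# Usage sketch for write_fdr_outputs (kept as a comment because it needs real
# csodiaq output on disk; all file paths here are hypothetical placeholders):
#
#     write_fdr_outputs('CsoDIAq_output.csv',
#                       'CsoDIAq_spectralFDR.csv',
#                       'CsoDIAq_peptideFDR.csv',
#                       'CsoDIAq_proteinFDR.csv')
#
# Passing a falsy protFile (e.g. '') skips protein inference and writes only
# the spectral- and peptide-level FDR files.
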
def mgf_library_upload(fileName):
    libMGF = mgf.read(fileName)
    smf.print_milestone('Enter library dictionary upload: ')
    lib = {}
    id = 0
    for spec in libMGF:
        id += 1
        # Library entries are keyed by (precursor m/z, peptide sequence).
        key = (spec['params']['pepmass'][0], spec['params']['seq'])
        charge = int(re.sub('[+-]', '', str(spec['params']['charge'][0])))
        name = spec['params']['title']
        protein = spec['params'].get('protein', '')
        decoy = 1 if 'DECOY' in name else 0

        # Square-root-transform the intensities, then keep only the ten most
        # intense peaks, re-sorted by m/z. Each peak carries the library ID.
        mz = spec['m/z array']
        intensity = [x**0.5 for x in spec['intensity array']]
        keyList = [id for x in mz]
        peaks = list(zip(mz, intensity, keyList))
        peaks.sort(key=lambda x: x[1], reverse=True)
        if len(peaks) > 10:
            peaks = peaks[:10]
        peaks.sort(key=lambda x: x[0])

        lib[key] = {
            'PrecursorCharge': charge,
            'transition_group_id': name,
            'ProteinName': protein,
            'Peaks': peaks,
            'ID': id,
            'Decoy': decoy,
        }
    return lib

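# Usage sketch for mgf_library_upload (comment-only; the .mgf path is a
# hypothetical placeholder):
#
#     lib = mgf_library_upload('human_library.mgf')
#     # lib: {(precursorMz, peptideSeq): {'PrecursorCharge': ..., 'Peaks': [...],
#     #                                   'ID': ..., 'Decoy': 0 or 1, ...}}
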
def filter_fdr_output_for_targeted_reanalysis(fdrFile, proteins, heavy):
    smf.print_milestone('Generate DISPA Targeted Reanalysis Files:')
    fdrDf = pd.read_csv(fdrFile)

    # Drop decoy identifications.
    fdrDf = fdrDf[~fdrDf['protein'].str.contains(
        'DECOY', na=False)].reset_index(drop=True)

    # For heavy (SILAC) reanalysis, keep only peptides ending in K or R.
    if heavy:
        fdrDf = fdrDf[fdrDf['peptide'].str.endswith('R') |
                      fdrDf['peptide'].str.endswith('K')].reset_index(drop=True)

    # If a per-protein cap is given, keep only protein-unique peptides and take
    # the top N by ion count for each leading protein.
    if proteins:
        fdrDf = fdrDf[fdrDf['uniquePeptide'] == 1].sort_values(
            'ionCount', ascending=False).reset_index(drop=True)
        fdrDf = fdrDf.groupby(['leadingProtein']).head(proteins).reset_index()
    return fdrDf

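# Usage sketch (comment-only; the path and parameter values are hypothetical):
#
#     targetedDf = filter_fdr_output_for_targeted_reanalysis(
#         'CsoDIAq_proteinFDR.csv', proteins=1, heavy=True)
#     # -> decoys removed, peptides restricted to K/R C-termini, and at most
#     #    one unique peptide kept per leading protein (ranked by ionCount).
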
def connect_mzxml_to_csodiaq_and_library(idFile, libFile, mzxmlFiles, maxPeaks):
    smf.print_milestone('Preparing Quantification Dictionaries:')
    metadataToScanDict = create_mzxml_to_csodiaq_dict(mzxmlFiles[0])
    fileType = idFile.split('.')[-1]
    if fileType == 'csv':
        fragDf = pd.read_csv(idFile)
    else:
        fragDf = pd.read_csv(idFile, sep='\t')
    scanToCsodiaqDict, libMetadataToScanDict = connect_csodiaq_data_to_scans(
        idFile, metadataToScanDict, fragDf)
    scanToLibPeaksDict = pool_library_spectra_by_scan(
        libFile, libMetadataToScanDict, fragDf, maxPeaks)
    return scanToCsodiaqDict, scanToLibPeaksDict

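# Usage sketch (comment-only; all paths and the maxPeaks value are hypothetical
# placeholders):
#
#     fragDict, libDict = connect_mzxml_to_csodiaq_and_library(
#         'CsoDIAq_peptideFDR.csv', 'library.mgf',
#         ['targeted_rerun.mzXML'], maxPeaks=3)
#
# Note that only the first mzXML file is used to build the metadata-to-scan map.
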
def heavy_light_quantification(fragDict, libDict, mzxmlFiles, outDir, massTol,
                               minMatch, ratioType, correction, hist):
    finalDf = initialize_quantification_output(fragDict, libDict)

    # Scans with no quantifiable ratio default to NaN.
    def initialize_ratio_dict_values():
        return np.nan

    for f in mzxmlFiles:
        allSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher()
        scanToNoiseIntensityCutoffDict = dict()
        with mzxml.read(f, use_index=True) as file:
            for scan in sorted(libDict.keys()):
                spec = file.get_by_id(scan)
                # Noise cutoff: half the mean of the ten least intense peaks.
                scanToNoiseIntensityCutoffDict[int(scan)] = np.mean(
                    sorted(spec['intensity array'])[:10]) / 2
                expSpectrum = smf.format_spectra_for_pooling(spec, scan, sqrt=False)
                expSpectrum.sort()
                libSpectra = sorted(libDict[scan])
                quantSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher()
                quantSpectraMatch.compare_spectra(
                    libSpectra, expSpectrum, massTol, minMatch)
                allSpectraMatch.extend_all_spectra(quantSpectraMatch)

        # A correction value of -1 disables the ppm-window correction step.
        if correction != -1:
            allSpectraMatch.filter_by_corrected_ppm_window(
                correction, hist, minMatch)

        ratioDict = defaultdict(initialize_ratio_dict_values)
        if len(allSpectraMatch.libraryIntensities) != 0:
            ratioDict = allSpectraMatch.determine_ratios(
                ratioDict, scanToNoiseIntensityCutoffDict, ratioType, minMatch)

        # One output column per mzXML file, keyed by (scan, peptide).
        finalDf[f] = [
            ratioDict[(int(row['scan']), row['peptide'])]
            for index, row in finalDf.iterrows()
        ]
    smf.print_milestone('Finish SILAC Quantification')
    return finalDf

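# Usage sketch (comment-only; the path and parameter values are hypothetical
# placeholders):
#
#     quantDf = heavy_light_quantification(
#         fragDict, libDict, ['silac_run.mzXML'], 'output_dir/',
#         massTol=30, minMatch=3, ratioType='median', correction=-1, hist=False)
#     # Each mzXML file becomes a column of heavy/light ratios in quantDf,
#     # with NaN wherever no ratio could be determined.
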
def perform_spectra_pooling_and_analysis(querySpectraFile, outFile, lib,
                                         tolerance, maxQuerySpectraToPool,
                                         corrected, histFile):
    smf.print_milestone('Begin Grouping Scans by m/z Windows:')
    queWindowDict, queScanValuesDict = pool_scans_by_mz_windows(querySpectraFile)
    print('Number of Unpooled MS/MS Query Spectra: ' +
          str(len(queScanValuesDict)))
    print('Number of Pooled MS/MS Query Spectra/Mz Windows: ' +
          str(len(queWindowDict)), flush=True)

    # Status prints are issued at intervals scaled to the number of pooled
    # windows: with 1-99 pooled query spectra, print after every window;
    # with 100-999, after every 10; and so on.
    printFriendlyCounter = 100
    while printFriendlyCounter < len(queWindowDict):
        printFriendlyCounter *= 10
    printFriendlyCounter //= 100

    allLibKeys, libIdToKeyDict, libIdToDecoyDict = gather_library_metadata(lib)
    allSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher()
    numWindowsAnalyzed = 0
    prevtime = timer()

    smf.print_milestone('Begin Pooled Spectra Analysis:')
    with mzxml.read(querySpectraFile, use_index=True) as spectra:
        for precMz_win, scans in queWindowDict.items():
            # Each window is (center m/z, width); find library spectra inside it.
            top_mz = precMz_win[0] + precMz_win[1] / 2
            bottom_mz = precMz_win[0] - precMz_win[1] / 2
            libKeys = identify_lib_spectra_in_window(top_mz, bottom_mz, allLibKeys)
            if len(libKeys) == 0:
                continue
            pooledLibSpectra = pool_lib_spectra(lib, libKeys)
            pooledQueSpectra = []
            for i in range(len(scans)):
                scanNumber = scans[i]
                queSpectrum = spectra.get_by_id(scanNumber)
                pooledQueSpectra += smf.format_spectra_for_pooling(
                    queSpectrum, scanNumber)
                # Compare in batches of maxQuerySpectraToPool scans (and at the
                # final scan), then clear the pool.
                if (i % maxQuerySpectraToPool == 0 and i != 0) or i == len(scans) - 1:
                    pooledQueSpectra.sort()
                    windowSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher()
                    windowSpectraMatches.compare_spectra(
                        pooledLibSpectra, pooledQueSpectra, tolerance,
                        libIdToDecoyDict)
                    allSpectraMatches.extend_all_spectra(windowSpectraMatches)
                    pooledQueSpectra.clear()
            numWindowsAnalyzed += 1
            if numWindowsAnalyzed % printFriendlyCounter == 0:
                time = timer()
                print('\nNumber of Pooled Experimental Spectra Analyzed: ' +
                      str(numWindowsAnalyzed))
                print('Number of Spectra in Current Pooled Spectra: ' +
                      str(len(scans)))
                print('Time Since Last Checkpoint: ' +
                      str(round(time - prevtime, 2)) + ' Seconds', flush=True)
                prevtime = time

    smf.print_milestone('Begin FDR Analysis:')
    maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    # A corrected value of -1 disables the ppm-correction pass; otherwise the
    # matches are filtered and the FDR cutoff is recomputed.
    if corrected != -1:
        smf.print_milestone('Begin Correction Process:')
        allSpectraMatches.filter_by_corrected_ppm_window(
            corrected, maccCutoff, histFile)
        smf.print_milestone('Begin Corrected FDR Analysis:')
        maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    smf.print_milestone('\nBegin Writing to File: ')
    allSpectraMatches.write_output(outFile, querySpectraFile, maccCutoff,
                                   queScanValuesDict, libIdToKeyDict, lib)

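# Usage sketch (comment-only; paths and parameter values are hypothetical
# placeholders):
#
#     perform_spectra_pooling_and_analysis(
#         'experiment.mzXML', 'CsoDIAq_output.csv', lib,
#         tolerance=30, maxQuerySpectraToPool=100,
#         corrected=-1, histFile=None)
#
# As in quantification, corrected == -1 skips the ppm-correction pass, so the
# FDR cutoff is computed only once.
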
def traml_library_upload(fileName):
    if fileName.endswith('.tsv'):
        lib_df = pd.read_csv(fileName, sep='\t')
    else:
        lib_df = pd.read_csv(fileName)
    smf.print_milestone('Enter library dictionary upload: ')

    # Pan-human and SpectraST libraries use different column names; map both
    # onto a single normalized set of headings.
    headings = traml_column_headings(lib_df.columns)
    normalizedColumns = [
        'PrecursorMz', 'FullUniModPeptideName', 'PrecursorCharge', 'ProductMz',
        'LibraryIntensity', 'transition_group_id', 'ProteinName'
    ]
    sourceColumns = [headings[c] for c in normalizedColumns]
    lib_df = lib_df.loc[:, lib_df.columns.intersection(sourceColumns)]
    lib_df = lib_df[sourceColumns]
    lib_df.columns = normalizedColumns

    # Square-root-transform intensities and key each row by
    # (precursor m/z, peptide sequence).
    lib_df['LibraryIntensity'] = [
        x**0.5 for x in list(lib_df['LibraryIntensity'])
    ]
    lib_df['ID'] = list(zip(lib_df['PrecursorMz'].tolist(),
                            lib_df['FullUniModPeptideName'].tolist()))
    mz_dict = lib_df.groupby("ID")['ProductMz'].apply(list).to_dict()
    intensity_dict = lib_df.groupby("ID")['LibraryIntensity'].apply(
        list).to_dict()
    lib_df.drop_duplicates(subset="ID", inplace=True)
    lib_df = lib_df.loc[:, lib_df.columns.intersection(
        ['ID', 'PrecursorCharge', 'transition_group_id', 'ProteinName'])]
    lib_df.set_index("ID", drop=True, inplace=True)
    lib = lib_df.to_dict(orient="index")

    # Pan-human library formats also differ in how peptides are matched to
    # proteins (especially decoys). This section adjusts for that discrepancy.
    if headings['type'] == 'PanHuman':
        for key in lib:
            proteins = lib[key]['ProteinName'].split('/')
            num = proteins.pop(0)
            newProteins = [x for x in proteins if 'DECOY' not in x]
            proteinStr = str(len(newProteins))
            for x in newProteins:
                if 'DECOY' in num:
                    proteinStr += ('/DECOY_' + x)
                else:
                    proteinStr += ('/' + x)
            lib[key]['ProteinName'] = proteinStr

    # As in mgf_library_upload: keep the ten most intense transitions per
    # precursor, re-sorted by m/z, and flag decoys.
    id = 0
    for key in lib:
        id += 1
        mz, intensity = (list(t) for t in zip(
            *sorted(zip(mz_dict[key], intensity_dict[key]))))
        keyList = [id for i in range(len(mz))]
        peaks = list(zip(mz, intensity, keyList))
        peaks.sort(key=lambda x: x[1], reverse=True)
        if len(peaks) > 10:
            peaks = peaks[:10]
        peaks.sort(key=lambda x: x[0])
        lib[key]['Peaks'] = peaks
        lib[key]['ID'] = id
        if 'DECOY' in lib[key]['ProteinName']:
            lib[key]['Decoy'] = 1
        else:
            lib[key]['Decoy'] = 0
    return lib

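# Usage sketch (comment-only; the library path is a hypothetical placeholder):
#
#     lib = traml_library_upload('pan_human_library.tsv')
#     # Same output shape as mgf_library_upload: a dict keyed by
#     # (PrecursorMz, FullUniModPeptideName) with a top-10 'Peaks' list plus
#     # 'PrecursorCharge', 'ProteinName', 'ID', and 'Decoy' per entry.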