def adjust_prec_mz(mgf_file, error, out_path):
    """
    Write a copy of an MGF file with recalibrated precursor m/z values.

    The precursor m/z of every spectrum is divided by (1 - error / 10^6) and
    the spectra are written to <out_path>/recal_<file name of mgf_file>.

    Parameters:
    -----------------------------------------
    mgf_file: str,
        path to the input MGF file
    error: number,
        precursor mass error; it is divided by 10^6 before use
    out_path: str,
        output directory, created if it does not exist

    Raises:
    -----------------------------------------
    Exception: if out_path exists and already contains the output file
    """
    recal_name = 'recal_' + os.path.basename(mgf_file)
    outfile = os.path.join(out_path, recal_name)

    # Only an already existing directory can hold a previous result.
    if os.path.exists(out_path):
        if os.path.isfile(outfile):
            raise Exception('File %s already exists!' % outfile)
    else:
        os.makedirs(out_path)

    spectra = ProteoFileReader.read_mgf(mgf_file)
    for spec in spectra:
        spec.pepmz = spec.getPrecursorMZ() / (1 - error / 1e6)

    ProteoFileReader.write_mgf(spectra, outfile)
def generate_cihcd_spectra(mzml_file):
    """
    Collect every MS3 scan of an mzML file as an MS2_spectrum object.

    The precursor m/z of each returned spectrum is the first "<m/z>@" value
    found in the scan's filter string; precursor intensity and charge are set
    to -1.

    Parameters:
    -----------------------------------------
    mzml_file: str,
        path to mzML file

    Return: list(MS2_spectrum)

    Raises:
    -----------------------------------------
    ValueError: if a filter string does not start with "FT" or "IT" or holds
        no "<m/z>@" token
    """
    mzml_reader = mzml.read(mzml_file)
    run_name = os.path.split(mzml_file)[1].split('.mzML')[0]

    cihcd_spectra = []
    for spectrum in mzml_reader:
        if spectrum['ms level'] != 3:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        # ValueError exists on Python 2 and 3 (StandardError is Python 2 only)
        # and is a StandardError subclass on Python 2.
        detector_match = re.search("^(FT|IT)", filter_str)
        precursor_mz_groups = re.findall("([0-9.]+)@", filter_str)
        if detector_match is None or not precursor_mz_groups:
            raise ValueError("filter string parse error: %s" % filter_str)

        ms2_id = spectrum['precursorList']['precursor'][0]['spectrumRef']
        title = run_name + " " + spectrum['id'] + " ms2_scanId=" + ms2_id
        rt = scan['scan start time'] * 60

        # take ms2 precursor as precursor (kept as the string captured from
        # the filter string)
        pre_mz = precursor_mz_groups[0]
        pre_int = -1
        pre_z = -1

        # list() so the peaks are not a one-shot iterator on Python 3
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        cihcd_spectra.append(
            ProteoFileReader.MS2_spectrum(title, rt, pre_mz, pre_int, pre_z,
                                          peaks))

    return cihcd_spectra
def deisotope_spectra(self, infile, in_type="MGF", n_jobs=-1,
                      return_type="spectrum", show_progress=False):
    """
    Function to deisotope spectra

    Loads the spectra from infile and hands each one, together with its
    running index, to self.parallel_helper through joblib-style
    Parallel/delayed.

    Parameters:
    -----------------------------------------
    infile: str,
        path to the input file
    in_type: str,
        input format, "MGF" or "mzML" (compared case-insensitively)
    n_jobs: int,
        forwarded to Parallel(n_jobs=...)
    return_type: str,
        forwarded to self.parallel_helper; with "df" the per-spectrum
        results are concatenated with pd.concat
    show_progress: bool,
        forwarded to self.parallel_helper

    Return: the pd.concat of all results if return_type == "df", otherwise
        the list of per-spectrum results

    Note: an unsupported in_type prints a message and calls sys.exit().
    """
    file_type = in_type.lower()

    if file_type == "mgf":
        # process a MGF
        spectra = PFR.MGF_Reader()
        spectra.load(infile)
    elif file_type == "mzml":
        mzml_file = oms.MzMLFile()
        exp = oms.MSExperiment()
        mzml_file.load(infile, exp)

        # get the MS2 spectra
        spectra = []
        for spectrum in exp:
            if spectrum.getMSLevel() != 2:
                continue
            precursor = spectrum.getPrecursors()[0]
            spectra.append(
                PFR.MS2_spectrum(
                    spectrum.getNativeID(),
                    spectrum.getRT(),
                    precursor.getMZ(),
                    precursor.getIntensity(),
                    precursor.getCharge(),
                    np.matrix(spectrum.get_peaks()).transpose()))
    else:
        print("In type is not supported.")
        sys.exit()

    results_store = Parallel(n_jobs=n_jobs)(
        delayed(self.parallel_helper)(spectrum, return_type, show_progress, ii)
        for ii, spectrum in enumerate(spectra))

    if return_type == "df":
        return pd.concat(results_store)
    return results_store
def adjust_prec_mz(mgf_file, ms1_error, ms2_error, outpath):
    """
    Write a copy of an MGF file with recalibrated precursor and peak values.

    The precursor m/z of every spectrum is divided by (1 + ms1_error / 10^6)
    and the first entry of every peak by (1 + ms2_error / 10^6). The result
    is written to <outpath>/recal_<file name of mgf_file>.

    Parameters:
    -----------------------------------------
    mgf_file: str,
        path to the input MGF file
    ms1_error: number,
        precursor error; it is divided by 10^6 before use
    ms2_error: number,
        fragment peak error; it is divided by 10^6 before use
    outpath: str,
        output directory, created if it does not exist

    Raises:
    -----------------------------------------
    Exception: if outpath exists and already contains the output file
    """
    target = os.path.join(outpath, 'recal_' + os.path.basename(mgf_file))

    if os.path.exists(outpath):
        if os.path.isfile(target):
            raise Exception('File %s already exists!' % target)
    else:
        os.makedirs(outpath)

    spectra = ProteoFileReader.read_mgf(mgf_file)
    for spec in spectra:
        # ms1/precursor correction
        # TODO wrong sign if newer version
        spec.pepmz = spec.getPrecursorMZ() / (1 + ms1_error / 1e6)

        # ms2 peak correction: rescale the first entry of every peak in place
        peak_table = spec.getPeaks()
        for row in range(len(peak_table)):
            peak_table[row][0] = peak_table[row][0] / (1 + ms2_error / 1e6)
        spec.peaks = peak_table

    ProteoFileReader.write_mgf(spectra, target)
def process_file(filepath, outdir, mscon_settings, split_acq, detector_filter,
                 mscon_exe, cihcd_ms3=False):
    """
    Convert one input file with msconvert and write the derived MGF files.

    The command line built by mscon_cmd is run (if it is not empty) and waited
    for; its exit status is not checked. The mzML file
    <outdir>/<file stem>.mzML is then optionally post-processed.

    Parameters:
    -----------------------------------------
    filepath: str,
        path to the input file
    outdir: str,
        output directory, created if it does not exist
    mscon_settings:
        forwarded to mscon_cmd as `settings`
    split_acq: bool,
        if true, msconvert is asked not to write an MGF (mgf=False) and the
        MGF is written here from mzml_to_MS2_spectra instead
    detector_filter:
        forwarded to mzml_to_MS2_spectra
    mscon_exe: str,
        msconvert executable
    cihcd_ms3: bool,
        if true, additionally write 'CIhcD_ms3_<file stem>.mgf' from
        generate_cihcd_spectra
    """
    # TODO implement option further up
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    conv_cmds = mscon_cmd(filepath=filepath, outdir=outdir,
                          settings=mscon_settings, mgf=not split_acq)
    if len(conv_cmds) > 0:
        msconvert = subprocess.Popen([mscon_exe] + conv_cmds)
        msconvert.communicate()

    filename = os.path.split(filepath)[1]
    # Strip the extension; a name without any '.' is kept whole (rfind would
    # return -1 and slicing with it would drop the last character).
    dot_pos = filename.rfind('.')
    stem = filename[:dot_pos] if dot_pos != -1 else filename
    mzml_file = os.path.join(outdir, stem + '.mzML')

    if cihcd_ms3:
        cihcd_spectra = generate_cihcd_spectra(mzml_file)
        ProteoFileReader.write_mgf(
            spectra=cihcd_spectra,
            outfile=os.path.join(outdir, 'CIhcD_ms3_' + stem + '.mgf'))

    if split_acq:
        split_spectra = mzml_to_MS2_spectra(mzml_file, detector_filter)
        ProteoFileReader.write_mgf(
            spectra=split_spectra,
            outfile=os.path.join(outdir, stem + '.mgf'))
def adjust_prec_mz(mgf_file, error, outpath):
    """
    Write a StavroX-style MGF copy with recalibrated precursor values.

    For every spectrum the value of getPrecursorMass() is divided by
    (1 - error / 10^6) and written as PEPMASS together with the title,
    precursor intensity (0 if not positive), charge, retention time and all
    peaks with an intensity above 0. Output goes to
    <outpath>/recal_<file name of mgf_file>.

    Parameters:
    -----------------------------------------
    mgf_file: str,
        path to the input MGF file
    error: number,
        precursor error; it is divided by 10^6 before use
    outpath: str,
        output directory, created if it does not exist

    Return: None. If outpath exists and already contains the output file,
        nothing is read or written.
    """
    outfile = os.path.join(outpath, 'recal_' + os.path.split(mgf_file)[1])
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    elif os.path.isfile(outfile):
        # output already present: leave it untouched
        return

    exp = ProteoFileReader.MGF_Reader()
    exp.load(mgf_file)

    # NOTE(review): one field per line, as MGF requires; confirm the leading
    # blank line / trailing newline against the downstream reader.
    stavrox_template = (
        "\n"
        "MASS=Monoisotopic\n"
        "BEGIN IONS\n"
        "TITLE={}\n"
        "PEPMASS={} {}\n"
        "CHARGE={}+\n"
        "RTINSECONDS={}\n"
        "{}\n"
        "END IONS\n"
    )
    legacy_python = sys.version_info.major < 3

    # the context manager closes the file even if a spectrum fails to format
    with open(outfile, "w") as out_writer:
        for spectrum in exp:
            prec_mz_new = spectrum.getPrecursorMass() / (1 - error / 10.**6)
            prec_int = spectrum.getPrecursorIntensity()

            if legacy_python:
                # peaks are iterated as (m/z, intensity) rows
                peak_lines = [
                    "%s %s" % (i[0], i[1]) for i in spectrum.peaks if i[1] > 0
                ]
            else:
                # peaks[0] holds the m/z values, peaks[1] the intensities
                mz_values = spectrum.peaks[0]
                intensities = spectrum.peaks[1]
                peak_lines = [
                    "%s %s" % (mz, intensities[i])
                    for i, mz in enumerate(mz_values)
                    if intensities[i] > 0
                ]

            out_writer.write(stavrox_template.format(
                spectrum.getTitle(),
                prec_mz_new,
                prec_int if prec_int > 0 else 0,
                int(spectrum.charge),
                spectrum.getRT(),
                "\n".join(peak_lines)))
def test():
    """
    Manual scratch test for the deisotoper, organised in '#%%' cells.

    Runs the deisotoper on files below data/, on a spectrum read from
    data/test.dta2d and on two synthetic spectra built from averagine
    intensities. Nothing is asserted and nothing is returned; results are
    only printed or plotted.
    """
    #%% cell: read an mzML file, then deisotope an MGF file
    infile = "data/mscon_PF_20_100_0_B160803_02.mzML"
    exp = PFR.mzMLReader(infile)
    print(exp)
    infile = "data/test.mgf"
    # the assignment above is overwritten immediately; only this file is used
    infile = "data/mscon_PF_20_100_0_B160803_02.mgf"
    deisotoper = de.Deisotoper()
    deisotoped_spectra = deisotoper.deisotope_spectra(infile, show_progress=True, n_jobs=-1)
    #%% cell: step-by-step deisotoping of a single spectrum from a table
    spectrum = pd.read_csv("data/test.dta2d", sep="\t")
    mz = spectrum["MZ"].values
    intensity = spectrum["INT"].values
    deisotoper = de.Deisotoper()
    G = deisotoper.spec2graph(mz, intensity)
    cluster_ar = deisotoper.extract_isotope_cluster(G, verbose=False)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(
        cluster_ar, mz, intensity)
    # NOTE(review): this goes through `de.deisotoper` (lower case), not the
    # `deisotoper` instance used everywhere else - confirm this is intended
    cluster_df = de.deisotoper.assemble_spectrum(cluster_dic_resolved, mz, intensity, 1337)
    # the first entry of every resolved value is summed as "accepted"
    n_accepted = sum([i[0] for i in cluster_dic_resolved.values()])
    n_rejected = len(cluster_dic_resolved) - n_accepted
    #%%
    #plt.bar(mz, intensity)
    verbose = False
    deisotoper = de.Deisotoper()
    # =========================================================================
    # example 1 - pseudo overlapping - charge 2 but 1 also possible
    # =========================================================================
    mz = np.array([0, 100, 250, 700, 700.5, 701, 701.5, 800])
    z_test = 2
    exp1 = AM.averagine_model(700 * z_test, n_peaks=4, only_intensity=True) * 100
    # four averagine intensities placed on the 0.5-spaced peaks at 700-701.5
    intensity = np.array([10, 10, 10, exp1[0], exp1[1], exp1[2], exp1[3], 10])
    GT = {}
    GT["ratio"] = "1:0"
    GT["Case"] = "test 1 - only one true isotope cluster"
    GT["TestID"] = "1"
    G = deisotoper.spec2graph(mz, intensity)
    de.plot_graph(G)
    cluster_dic = deisotoper.extract_isotope_cluster(G, verbose)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(
        cluster_dic, mz, intensity)
    #plt.bar(mz, intensity, width=0.2)
    #plt.xlim(699, 702)
    #%%
    # =========================================================================
    # example 2 - overlapping charge 1 and 2
    # =========================================================================
    verbose = True
    deisotoper = de.Deisotoper()
    # start of the timed section
    a = time.time()
    mz = np.array([0, 100, 250, 300, 500,
                   501, 501.5, 502, 503])
    exp1 = AM.averagine_model(
        500, n_peaks=4, only_intensity=True) * 100 * 3  #+ np.random.normal(0, 10, 4)
    exp2 = AM.averagine_model(
        501.5 * 2, n_peaks=3, only_intensity=True) * 100 * 6  #+ np.random.normal(0, 10, 3)
    # the peaks at 501 and 502 carry the summed intensity of both patterns
    intensity = np.array([
        10, 10, 10, 10, exp1[0], exp1[1] + exp2[0], exp2[1],
        exp1[2] + exp2[2], exp1[3]
    ])
    GT = {}
    GT["ratio"] = "1:2"
    GT["Case"] = "test 2 - Overlapping different charge"
    GT["TestID"] = "2"
    G = deisotoper.spec2graph(mz, intensity)
    de.plot_graph(G)
    cluster_ar = deisotoper.extract_isotope_cluster(G, verbose)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(cluster_ar, mz, intensity, verbose=verbose, GT=GT)
    print(cluster_ar)
    b = time.time()
    # elapsed time in minutes, printed scaled by 30000
    took = (b - a) / 60.
    print(took * 30000)
def split_mzml(mzml_file, detector="all"):
    """
    function to split a mzML file into dict of MS2_Spectra objects (can be
    written to mgf format) by fragmentation method

    The detector ("FT"/"IT") and the fragmentation methods are parsed from
    each MS2 scan's filter string.

    Parameters:
    -----------------------------------------
    mzml_file: str,
        path to mzML file
    detector: str,
        "all" keeps every scan, otherwise only scans whose filter string
        starts with this detector type are kept

    Return: dict {fragMethod: list(MS2_spectrum)}, without empty entries

    Raises:
    -----------------------------------------
    ValueError: if a filter string does not start with "FT" or "IT"
    Warning: if the fragmentation method of at least one kept spectrum could
        not be identified
    """
    mzml_reader = mzml.read(mzml_file)
    run_name = os.path.split(mzml_file)[1].split('.mzML')[0]

    ordered_ms2_spectra = {
        "CID": [],
        "HCD": [],
        "ETD": [],
        "ETciD": [],
        "EThcD": [],
        "unknown": []
    }

    for spectrum in mzml_reader:
        if spectrum['ms level'] != 2:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        # ValueError exists on Python 2 and 3 (StandardError is Python 2 only)
        # and is a StandardError subclass on Python 2.
        detector_match = re.search("^(FT|IT)", filter_str)
        if detector_match is None:
            raise ValueError("filter string parse error: %s" % filter_str)
        detector_str = detector_match.groups()[0]
        # [A-Za-z], not [A-z]: the latter also matches '[', '\', ']', '^',
        # '_' and '`'
        frag_groups = re.findall("@([A-Za-z]+)([0-9.]+)", filter_str)

        if detector != "all" and detector != detector_str:
            continue

        title = run_name + " " + spectrum['id']
        rt = scan['scan start time'] * 60
        precursor = spectrum['precursorList']['precursor'][0][
            'selectedIonList']['selectedIon'][0]
        pre_mz = precursor['selected ion m/z']
        try:
            pre_int = precursor['peak intensity']
        except KeyError:
            pre_int = 0
        pre_z = precursor['charge state']
        # list() so the peaks are not a one-shot iterator on Python 3
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        ms2class_spectrum = ProteoFileReader.MS2_spectrum(
            title, rt, pre_mz, pre_int, pre_z, peaks)

        # etd combined with cid/hcd counts as the hybrid method
        frag_methods = [f[0] for f in frag_groups]
        if "etd" in frag_methods:
            if "cid" in frag_methods:
                frag_key = 'ETciD'
            elif "hcd" in frag_methods:
                frag_key = 'EThcD'
            else:
                frag_key = 'ETD'
        elif "cid" in frag_methods:
            frag_key = 'CID'
        elif "hcd" in frag_methods:
            frag_key = 'HCD'
        else:
            frag_key = 'unknown'
        ordered_ms2_spectra[frag_key].append(ms2class_spectrum)

    n_unknown = len(ordered_ms2_spectra['unknown'])
    if n_unknown > 0:
        raise Warning(
            "The fragmentation method of %i spectra could not be identified"
            % n_unknown)

    return {k: v for k, v in ordered_ms2_spectra.items() if len(v) > 0}
def mzml_to_MS2_spectra(mzml_file, detector_filter="all"):
    """
    function to split a mzML file into a list of MS2_Spectra objects (can be
    written to mgf format) with fragmentation method and detector type

    The detector ("FT"/"IT") and the fragmentation methods are parsed from
    each MS2 scan's filter string.

    Parameters:
    -----------------------------------------
    mzml_file: str,
        path to mzML file
    detector_filter: str,
        filter scans by detector type ('all', 'FT', 'IT')

    Return: list(MS2_spectrum)

    Raises:
    -----------------------------------------
    Exception: if a filter string does not start with "FT" or "IT"
    Warning: if the fragmentation method of at least one kept spectrum could
        not be identified
    """
    mzml_reader = mzml.read(mzml_file)
    run_name = os.path.split(mzml_file)[1].split('.mzML')[0]

    sorted_ms2_spectra = []
    unknown_frag_method_count = 0

    for spectrum in mzml_reader:
        if spectrum['ms level'] != 2:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        detector_match = re.search("^(FT|IT)", filter_str)
        if detector_match is None:
            raise Exception("filter string parse error: %s" % filter_str)
        detector_str = detector_match.groups()[0]
        # [A-Za-z], not [A-z]: the latter also matches '[', '\', ']', '^',
        # '_' and '`'
        frag_groups = re.findall("@([A-Za-z]+)([0-9.]+)", filter_str)

        if detector_filter != "all" and detector_filter != detector_str:
            continue

        title = run_name + " " + spectrum['id']
        rt = scan['scan start time'] * 60
        precursor = spectrum['precursorList']['precursor'][0][
            'selectedIonList']['selectedIon'][0]
        pre_mz = precursor['selected ion m/z']
        try:
            pre_int = precursor['peak intensity']
        except KeyError:
            pre_int = 0
        pre_z = precursor['charge state']
        # list() so the peaks are not a one-shot iterator on Python 3
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        # etd combined with cid/hcd counts as the hybrid method
        frag_methods = [f[0] for f in frag_groups]
        if "etd" in frag_methods:
            if "cid" in frag_methods:
                frag_method = "ETciD"
            elif "hcd" in frag_methods:
                frag_method = "EThcD"
            else:
                frag_method = "ETD"
        elif "cid" in frag_methods:
            frag_method = "CID"
        elif "hcd" in frag_methods:
            frag_method = "HCD"
        else:
            frag_method = 'unknown'
            unknown_frag_method_count += 1

        sorted_ms2_spectra.append(
            ProteoFileReader.MS2_spectrum(
                title, rt, pre_mz, pre_int, pre_z, peaks,
                detector=detector_str, fragmethod=frag_method))

    if unknown_frag_method_count > 0:
        raise Warning(
            "The fragmentation method of %i spectra could not be identified"
            % unknown_frag_method_count)

    return sorted_ms2_spectra
# Recalibrate every non-ms3 MGF file, then continue with the 'recal_' copies.
if not os.path.exists(outdir):
    os.makedirs(outdir)

# TODO change to parallel with manual input of error
for inputfile in mgf_file_list:
    # files with 'ms3' in their name are skipped
    if 'ms3' in os.path.basename(inputfile):
        continue

    # both recalibration modules are called with the same keyword arguments
    (mass_recal_ms2 if ms2recal else mass_recal).main(
        fasta=recal_conf['db'],
        xi_cnf=recal_conf['xiconf'],
        outpath=outdir,
        mgf=inputfile,
        threads=str(nthr),
        val_input=recal_conf['shift_csv'])

# every entry (skipped files included) now refers to 'recal_<name>' in the
# directory of the original file
mgf_file_list = [
    os.path.join(os.path.dirname(x), 'recal_' + os.path.basename(x))
    for x in mgf_file_list
]

if split_acq:
    for mgf_file in mgf_file_list:
        ProteoFileReader.split_mgf_methods(mgf_file)