def read_scans(mzml_file, ms_levels=(1, 2), should_renumber_ifmissing=True):
    """
    Yield parsed scans from an mzML file, optionally filtered by MS level.

    :param mzml_file: mzML source, passed unchanged to ``mzml.MzML``.
    :param ms_levels: collection of MS levels to keep. Pass ``None`` to
        disable the filter and hand every scan to ``read_scan``.
    :param should_renumber_ifmissing: if true, the 1-based position of the
        scan in the file (counted over *all* scans, including those filtered
        out by ``ms_levels``) is passed to ``read_scan`` as
        ``default_scan_number``. If false, ``None`` is passed instead, so
        ``read_scan`` has no fallback scan number.
    :return: generator over the values returned by ``read_scan``.
    """
    with mzml.MzML(mzml_file) as reader:
        for cur_scanidx_1based, scan in enumerate(reader, start=1):
            if ms_levels is not None and scan['ms level'] not in ms_levels:
                continue

            default_scan_number = (
                cur_scanidx_1based if should_renumber_ifmissing else None)

            # Scans that raise ValueError are skipped. Per the original
            # author's note, ValueError is only raised when the charge cannot
            # be inferred, and losing those scans is acceptable as long as
            # enough scans with an inferable charge remain.
            try:
                parsed_scan = read_scan(
                    scan, default_scan_number=default_scan_number)
            except ValueError as e:
                logger.debug("Warning! Failed to read scan: %s", e)
                continue
            yield parsed_scan
def readmzXML(file_paths):
    """Load raw spectra from one or more files into a dict keyed by title.

    Despite the function name, the files are opened with ``mzml.MzML``.

    Parameters
    ----------
    file_paths : iterable
        Paths of the files to read.

    Returns
    -------
    dict
        Maps each spectrum's ``'spectrum title'`` to
        ``{'m/z': <m/z array>, 'intensity': <intensity array>}``. When the
        same title occurs more than once (within or across files), the last
        spectrum read wins.
    """
    data_dict = {}
    for file_path in file_paths:
        # The context manager releases the file handle once the file is read.
        with mzml.MzML(file_path) as data:
            file_num = len(data)
            for spectrum in data:
                data_dict[spectrum['spectrum title']] = {
                    'm/z': spectrum['m/z array'],
                    'intensity': spectrum['intensity array'],
                }
        print(f'successfully loaded {file_num} raw spectra from {file_path}')
    return data_dict
def get_spectra(source: Union[IO, str], scan_nrs: Sequence[int] = None)\
        -> Iterator[MsmsSpectrum]:
    """
    Get the MS/MS spectra from the given mzML file, optionally filtering by
    scan number.

    Parameters
    ----------
    source : Union[IO, str]
        The mzML source (file name or open file object) from which the spectra
        are read.
    scan_nrs : Sequence[int]
        Only read spectra with the given scan numbers. Each number is looked
        up by the native id
        ``controllerType=0 controllerNumber=1 scan=<scan_nr>``. If `None`, no
        filtering on scan number is performed and every spectrum whose
        ``'ms level'`` is 2 is read.

    Returns
    -------
    Iterator[MsmsSpectrum]
        An iterator over the requested spectra in the given file. Spectra for
        which ``_parse_spectrum`` raises ``ValueError`` are skipped (logged at
        debug level). An ``LxmlError`` while reading ends the iteration early
        with a warning instead of propagating.
    """
    with mzml.MzML(source) as f_in:
        if scan_nrs is not None:
            # Iterate over a subset of spectra filtered by scan number.
            spectra = (
                f_in.get_by_id(
                    f'controllerType=0 controllerNumber=1 scan={scan_nr}')
                for scan_nr in scan_nrs)
        else:
            # Or iterate over all MS/MS spectra.
            spectra = (
                spectrum_dict for spectrum_dict in f_in
                if int(spectrum_dict.get('ms level', -1)) == 2)

        try:
            for spectrum in spectra:
                try:
                    parsed = _parse_spectrum(spectrum)
                except ValueError as e:
                    # Unparseable spectra are dropped on purpose; keep a
                    # low-volume trace so the loss is still diagnosable.
                    logger.debug('Failed to read spectrum %s: %s',
                                 spectrum.get('id'), e)
                    continue
                yield parsed
        except LxmlError as e:
            logger.warning('Failed to read file %s: %s', source, e)
def extract_from_mzml(path):
    """Extract MS1 and MS2 data from ``{path}file.mzML`` into a cache file.

    The result is written, gzip-compressed, to ``{path}mzML.json``. If that
    file already exists the function does nothing.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to ``file.mzML`` and ``mzML.json`` (no path
        separator is inserted).

    Notes
    -----
    The cached structure is ``{'ms1': {scan_id: {...}}, 'ms2': {scan_id:
    {...}}}``. The integer scan ids become strings once serialised, because
    JSON object keys are always strings. Spectra whose ``'ms level'`` is
    neither 1 nor 2 are ignored.
    """
    # Extract the data from the mzML only if we haven't already.
    if os.path.exists(f'{path}mzML.json'):
        return

    if not multiprocessing:
        print( 'Extracting data from mzML ', end='\r')

    # Extracted data
    extracted = {'ms1': {}, 'ms2': {}}

    # The context manager closes the mzML reader once every spectrum is read.
    with mzml.MzML(f'{path}file.mzML') as data:
        for spectrum in data:
            ms_level = spectrum['ms level']
            if ms_level not in (1, 2):
                continue

            # The scan id is the number following 'scan=' in the native id.
            scan_id = int(spectrum['id'].split('scan=')[1])

            if ms_level == 1:
                # Deal with MS level 1 spectra
                ms1_spectrum = process_ms1(spectrum)
                extracted['ms1'][scan_id] = {
                    'mz': ms1_spectrum['mz'],
                    'intensity': ms1_spectrum['intensity'],
                    'scan_time': ms1_spectrum['scan_time'],
                }
            else:
                # Deal with MS level 2 spectra
                ms2_spectrum = process_ms2(spectrum)
                extracted['ms2'][scan_id] = {
                    'scan_index': ms2_spectrum['scan_index'],
                    'precursor_scan': ms2_spectrum['precursor_scan'],
                    'precursor_ion': ms2_spectrum['precursor_ion'],
                    # Copied into plain lists before being serialised.
                    'm/z_array': list(ms2_spectrum['m/z']),
                    'rt_array': list(ms2_spectrum['rt']),
                }

    with gzip.GzipFile(f'{path}mzML.json', 'w') as fout:
        fout.write(json.dumps(extracted).encode('utf-8'))
def mzml_to_pandas_df(filename):
    """
    Reads mzML file and returns a pandas.DataFrame.

    Every spectrum contributes one row per peak, with the columns
    ``retentionTime`` (the first scan's ``'scan time'`` divided by 60),
    ``m/z array`` and ``intensity array``. The per-spectrum index is kept, so
    index values repeat across spectra.

    A spectrum that lacks one of the required fields, or whose arrays cannot
    be combined into a frame, is skipped with a warning. Previously the first
    such spectrum silently ended the whole read.

    Raises ``ValueError`` (from ``pandas.concat``) when no spectrum could be
    read.
    """
    import warnings

    cols = ["retentionTime", "m/z array", "intensity array"]
    slices = []
    with mzml.MzML(filename) as reader:
        for spectrum in reader:
            try:
                scan_time = spectrum["scanList"]["scan"][0]["scan time"]
                # Build the frame from the needed columns only, so unrelated
                # metadata cannot break DataFrame construction.
                frame = pd.DataFrame({
                    "retentionTime": scan_time / 60,
                    "m/z array": spectrum["m/z array"],
                    "intensity array": spectrum["intensity array"],
                })
            except (KeyError, IndexError, TypeError, ValueError) as err:
                warnings.warn(
                    f"Skipping spectrum {spectrum.get('id')!r}: {err!r}")
                continue
            slices.append(frame)
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    return df
def mzml_to_pandas_df(filename):
    '''
    Reads mzML file and returns a pandas.DataFrame.

    Every spectrum contributes one row per peak, with the columns
    ``retentionTime`` (the first scan's ``'scan time'`` divided by 60),
    ``m/z array`` and ``intensity array``. The per-spectrum index is kept, so
    index values repeat across spectra.

    A spectrum that lacks one of the required fields, or whose arrays cannot
    be combined into a frame, is skipped with a warning. Previously the first
    such spectrum silently ended the whole read.

    Raises ``ValueError`` (from ``pandas.concat``) when no spectrum could be
    read.
    '''
    import warnings

    cols = ['retentionTime', 'm/z array', 'intensity array']
    slices = []
    with mzml.MzML(filename) as reader:
        for spectrum in reader:
            try:
                scan_time = spectrum['scanList']['scan'][0]['scan time']
                # Build the frame from the needed columns only, so unrelated
                # metadata cannot break DataFrame construction.
                frame = pd.DataFrame({
                    'retentionTime': scan_time / 60,
                    'm/z array': spectrum['m/z array'],
                    'intensity array': spectrum['intensity array'],
                })
            except (KeyError, IndexError, TypeError, ValueError) as err:
                warnings.warn(
                    f"Skipping spectrum {spectrum.get('id')!r}: {err!r}")
                continue
            slices.append(frame)
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    return df
def mzml_to_pandas_df_pyteomics(fn):
    '''
    Reads mzML file and returns a pandas.DataFrame.

    Every spectrum contributes one row per peak, with the columns
    ``retentionTime`` (the first scan's ``'scan time'`` divided by 60),
    ``m/z array`` and ``intensity array``. After ``df_to_numeric`` has been
    applied, the intensities are cast to ``int`` and the index is reset to a
    fresh 0..N-1 range.

    A spectrum that lacks one of the required fields, or whose arrays cannot
    be combined into a frame, is skipped with a warning. Previously the first
    such spectrum silently ended the whole read.

    Raises ``ValueError`` (from ``pandas.concat``) when no spectrum could be
    read.
    '''
    import warnings

    cols = ['retentionTime', 'm/z array', 'intensity array']
    slices = []
    with mzml.MzML(fn) as ms_data:
        for spectrum in ms_data:
            try:
                scan_time = spectrum['scanList']['scan'][0]['scan time']
                # Build the frame from the needed columns only, so unrelated
                # metadata cannot break DataFrame construction.
                frame = pd.DataFrame({
                    'retentionTime': scan_time / 60,
                    'm/z array': spectrum['m/z array'],
                    'intensity array': spectrum['intensity array'],
                })
            except (KeyError, IndexError, TypeError, ValueError) as err:
                warnings.warn(
                    f"Skipping spectrum {spectrum.get('id')!r}: {err!r}")
                continue
            slices.append(frame)
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    df['intensity array'] = df['intensity array'].astype(int)
    df = df.reset_index(drop=True)
    return df
def internalmzML(path):
    """Cache the MS1 data of ``{path}file.mzML`` and delete the mzML file.

    The result is written, gzip-compressed, to ``{path}mzML.json``. If that
    file already exists the function does nothing.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to ``file.mzML`` and ``mzML.json`` (no path
        separator is inserted).

    Notes
    -----
    The cached structure is ``{'ms1': {scan_id: {...}}}``. The integer scan
    ids become strings once serialised, because JSON object keys are always
    strings. Only spectra with ``'ms level'`` equal to 1 are kept; MS2
    extraction is intentionally not performed here.
    """
    # Extract the data from the mzML only if we haven't already.
    if os.path.exists(f'{path}mzML.json'):
        return

    if not multithread:
        print( 'Extracting data from mzML ', end='\r')

    # Extracted data
    extracted = {'ms1': {}}

    # The context manager closes the reader before the source file is
    # removed below.
    with mzml.MzML(f'{path}file.mzML') as data:
        for spectrum in data:
            if spectrum['ms level'] != 1:
                continue

            # The scan id is the number following 'scan=' in the native id.
            scan_id = int(spectrum['id'].split('scan=')[1])

            # Deal with MS level 1 spectra
            ms1_spectrum = process_ms1(spectrum)
            extracted['ms1'][scan_id] = {
                'mz': ms1_spectrum['mz'],
                'intensity': ms1_spectrum['intensity'],
                'scan_time': ms1_spectrum['scan_time'],
            }

    with gzip.GzipFile(f'{path}mzML.json', 'w') as fout:
        fout.write(json.dumps(extracted).encode('utf-8'))

    # NOTE(review): the source mzML is deleted once the cache is written.
    # This assumes the removal belongs to the cache-miss path only - confirm
    # against the original layout.
    os.remove(f'{path}file.mzML')
def read(mzml_file: str, max_peaks: int = None, min_intensity: float = None)\
        -> "DIARun":
    """
    Read an mzML file from a DIA experiment.

    Parameters
    ----------
    mzml_file : str
        The mzML file to read.
    max_peaks : int
        Forwarded to ``_mkscan`` as the ``max_peaks`` keyword argument for
        every spectrum. Defaults to `None`.
    min_intensity : float
        Forwarded to ``_mkscan`` as the ``min_intensity`` keyword argument for
        every spectrum. Defaults to `None`.

    Returns
    -------
    diadem.dataset.DIARun
        A DIARun object containing the raw data.
    """
    kwargs = {"max_peaks": max_peaks, "min_intensity": min_intensity}
    with mzml.MzML(mzml_file) as mz_dat:
        # Spectra are converted by ``_mkscan`` in 4 worker processes; the
        # results are collected while the file is still open.
        converted = mz_dat.map(_mkscan, kwargs=kwargs, processes=4)
        scans = DIARun(list(_pbar(converted)))

    return scans
def qc1_main():
    """Command-line entry point of the iRT peptide QC tool.

    Reads the mzML file given by ``--mzml`` and the tab-separated targets
    table given by ``--targets`` (columns ``Sequence``, ``Precursor_Mz`` and
    ``Product_Mz`` are used), then:

    1. For every unique (Sequence, Precursor_Mz): extracts the MS1 XIC,
       measures the apex, the two chromatographic widths/areas and the MS1
       peak at the apex, and draws one row of four diagnostic plots.
    2. For every (Sequence, Precursor_Mz) group: extracts the MS2 scans of the
       precursor around the MS1 peak, locates the MS2 TIC apex and measures
       each target fragment in the apex spectrum, drawing one plot row.

    Output files (``<base>`` is the mzML path without its last extension):
    ``<base>_Figs.pdf`` (both figures), ``<base>_MS1_table.csv`` and
    ``<base>_MS2_table.csv`` (tab-separated), and ``MS1.pdf`` in the current
    working directory. With ``--debug`` the parsed spectra are additionally
    cached in ``<mzml>.pkl``.
    """
    parser = ArgumentParser(description="iRT peptide QC tool")
    parser.add_argument('--mzml', type=str, required=True, help="MzML file")
    parser.add_argument('--targets', type=str, required=True,
                        help="Targets file")
    parser.add_argument('--ms1-ppm', type=float, default=5,
                        help="MS1 extraction window in ppm")
    parser.add_argument('--ms2-prec-tolerance', type=float, default=0.01,
                        help="MS2 precursor tolerance")
    parser.add_argument('--ms2-frag-tolerance', type=float, default=1,
                        help="MS2 fragment tolerance")
    parser.add_argument('--width-1-pc', type=float, default=50,
                        help="Chromatographic width 1 in %% of apex")
    parser.add_argument('--width-2-pc', type=float, default=5,
                        help="Chromatographic width 2 in %% of apex")
    parser.add_argument('--debug', action="store_true",
                        help="Pickle cache input file")
    args = parser.parse_args()

    b_fname = ".".join(args.mzml.split(".")[:-1])
    pdf = PdfPages(b_fname + "_Figs.pdf")

    if args.debug:
        import pickle
        import time
        import os
        if os.path.exists(args.mzml + ".pkl"):
            # NOTE: unpickling executes arbitrary code; the cache file must
            # only ever be one written by this tool.
            with open(args.mzml + ".pkl", "rb") as f_:
                _start_time = time.time()
                print("Unpickling")
                exp = lcmsms.LCMSMSExperiment(
                    tqdm.tqdm(pickle.load(f_)),
                    prec_tolerance=args.ms2_prec_tolerance)
                print(f"Unpickled in {time.time()-_start_time} seconds")
        else:
            print("Reading and pickling")
            # The reader is fully consumed here, so it can be closed at once.
            with mzml.MzML(args.mzml) as reader:
                mzml_ = list(tqdm.tqdm(reader))
            with open(args.mzml + ".pkl", "wb") as f_:
                pickle.dump(mzml_, f_)
            print("Pickled, parsing experiment")
            exp = lcmsms.LCMSMSExperiment(
                tqdm.tqdm(mzml_), prec_tolerance=args.ms2_prec_tolerance)
            del mzml_
    else:
        # Left open on purpose: whether LCMSMSExperiment consumes the reader
        # eagerly is not visible from here.
        exp = lcmsms.LCMSMSExperiment(
            tqdm.tqdm(mzml.MzML(args.mzml)),
            prec_tolerance=args.ms2_prec_tolerance)

    ### MS1 processing ####
    targets = pd.read_csv(args.targets, sep='\t')
    targets_ms1 = targets[["Sequence", "Precursor_Mz"]].drop_duplicates()
    ms1_columns = [
        "Sequence",
        "Precursor_Mz",
        "Apex_time",
        f"Width_{args.width_1_pc}_pc_time_start",
        f"Width_{args.width_1_pc}_pc_time_end",
        f"Width_{args.width_1_pc}_xic_area",
        f"Width_{args.width_2_pc}_pc_time_start",
        f"Width_{args.width_2_pc}_pc_time_end",
        f"Width_{args.width_2_pc}_xic_area",
        f"MS1_mass_apex_mz",
        f"MS1_apex_height",
        f"MS1_peak_halfwidth",
        f"MS1_peak_area",
        f"TIC_MS2",
    ]
    # Rows are collected in a list and turned into a frame once;
    # DataFrame.append no longer exists in pandas >= 2.0.
    ms1_rows = []

    # squeeze=False keeps ``axs`` two-dimensional even for a single target.
    fig, axs = plt.subplots(len(targets_ms1),
                            4,
                            figsize=(15, 60),
                            gridspec_kw={'width_ratios': [1, 1, 1, 1]},
                            squeeze=False)
    plt.subplots_adjust(hspace=0.5)

    for n, (k, row) in enumerate(targets_ms1.iterrows()):
        mz = row["Precursor_Mz"]
        seq = row["Sequence"]

        ch = exp.ms1.xic(mz, args.ms1_ppm)
        chs = ch.smooth(sigma=2)
        # The apex comes from the raw XIC, widths/areas from the smoothed one.
        apext, apexi = ch.get_apex()
        width1 = chs.get_width_pc(args.width_1_pc)
        width2 = chs.get_width_pc(args.width_2_pc)
        area1 = chs.get_width_pc_area(args.width_1_pc)
        area2 = chs.get_width_pc_area(args.width_2_pc)

        spec = exp.ms1[apext]
        ms1_apex_mz, ms1_apex_int = spec.get_apex_around(mz, 0.05)
        ms1_hw = spec.get_apex_width_pc(mz, apex_pc=50, tolerance=0.05)
        ms1_area = spec.get_peak_area(mz, tolerance=0.05)

        ### PLOTS ###
        # XIC
        ax = axs[n, 0]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        ax.plot(ch.t, ch.i, "g-")
        ax.vlines(apext, 0, apexi * 1.1)
        ax.title.set_text(f"{seq}\nmz={mz:.4f}\napex@{apext:.2f}min")
        ax.set_xlim(15, 30)

        # XIC zoom
        ax = axs[n, 1]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        ax.plot(ch.t, ch.i, "gx-")
        ax.plot(chs.t, chs.i, "rx-")
        ax.vlines(apext, 0, apexi)
        ax.title.set_text(f"MS1 XIC zoon\n mz={mz:.4f}")
        ax.hlines(apexi * 0.5, *width1)
        ax.hlines(apexi * 0.05, *width2)
        ax.set_xlim(apext - 0.2, apext + 0.4)
        ax.text(0.45,
                0.95,
                f"Area50={area1:.3e}\nArea5 ={area2:.3e}",
                transform=ax.transAxes,
                fontsize=10,
                verticalalignment='top')

        # MS1 spectrum
        ax = axs[n, 2]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        ax.title.set_text(f"MS1 spectrum\n@time={apext:.2f}min")
        spec.plot(ax=ax, marks=[ms1_apex_mz])

        # MS1 spectrum zoom
        spec_zoom = spec[ms1_apex_mz -
                         args.ms2_prec_tolerance:ms1_apex_mz +
                         args.ms2_prec_tolerance]  # No /2 (sic!)
        ax = axs[n, 3]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x",
                            style='sci',
                            scilimits=(-3, -3),
                            useOffset=ms1_apex_mz)
        spec_zoom.plot("go-", ax=ax)
        ax.title.set_text("MS1 zoom\n mz={:.4f}".format(mz))
        ax.vlines(ms1_apex_mz, 0, ms1_apex_int, "r")
        ms1_w_left, ms1_w_right = spec.get_apex_times_pc(ms1_apex_mz,
                                                         apex_pc=50,
                                                         tolerance=0.05)
        ax.hlines(ms1_apex_int / 2, ms1_w_left, ms1_w_right, "r")
        ax.text(0.55,
                0.95,
                f"Area={ms1_area:.2e}\nHW={ms1_hw:.2e}",
                transform=ax.transAxes,
                fontsize=10,
                verticalalignment='top')
        ############

        row['Apex_time'] = apext
        row[f"Width_{args.width_1_pc}_pc_time_start"] = width1[0]
        row[f"Width_{args.width_1_pc}_pc_time_end"] = width1[1]
        row[f"Width_{args.width_1_pc}_xic_area"] = area1
        row[f"Width_{args.width_2_pc}_pc_time_start"] = width2[0]
        row[f"Width_{args.width_2_pc}_pc_time_end"] = width2[1]
        row[f"Width_{args.width_2_pc}_xic_area"] = area2
        row[f"MS1_mass_apex_mz"] = ms1_apex_mz
        row[f"MS1_apex_height"] = ms1_apex_int
        row[f"MS1_peak_halfwidth"] = ms1_hw
        row[f"MS1_peak_area"] = ms1_area
        ms1_rows.append(row)

    # "TIC_MS2" is absent from the rows and therefore starts out as NaN; it is
    # filled in during MS2 processing below.
    results_ms1 = pd.DataFrame(ms1_rows, columns=ms1_columns)
    pdf.savefig(fig)
    plt.close(fig)

    ### MS2 processing ###
    results_ms1.set_index("Sequence", drop=True, inplace=True)
    targets_ms2 = targets[["Sequence", "Precursor_Mz",
                           "Product_Mz"]].drop_duplicates()
    ms2_columns = [
        "Sequence",
        "Precursor_Mz",
        "Product_Mz",
        "MS2_TIC_Apex_time",
        "MS2_mass_apex_mz",
        "MS2_apex_height",
        "MS2_peak_halfwidth",
        "MS2_peak_area",
    ]
    ms2_rows = []

    # Largest number of fragments of any precursor: one plot column each, in
    # addition to the three fixed columns.
    n_ = max(
        len(grp)
        for _, grp in targets_ms2.groupby(by=["Sequence", "Precursor_Mz"]))
    fig, axs = plt.subplots(len(targets_ms1),
                            3 + n_,
                            figsize=(15, 100),
                            squeeze=False)
    plt.subplots_adjust(hspace=0.5)

    for n, (k, grp) in enumerate(
            targets_ms2.groupby(by=["Sequence", "Precursor_Mz"])):
        seq = k[0]
        prec = k[1]
        apext = results_ms1.loc[seq, "Apex_time"]
        start = results_ms1.loc[seq,
                                f"Width_{args.width_2_pc}_pc_time_start"]
        stop = results_ms1.loc[seq, f"Width_{args.width_2_pc}_pc_time_end"]
        ms2_all = exp.ms2.extract(prec)
        ms2_ext = ms2_all[start - args.ms2_frag_tolerance / 2:stop +
                          args.ms2_frag_tolerance / 2]
        tic_apext, tic_apexint = ms2_ext.tic.get_apex()
        results_ms1.loc[seq, "TIC_MS2"] = tic_apexint
        spec = ms2_ext[tic_apext]

        ### PLOTS ###
        # TIC MS2
        ax = axs[n, 0]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        ax.plot(ms2_all.tic.t, ms2_all.tic.i, "g-")
        ax.title.set_text("TIC MS2\nmz={:.4f}\n apex@{:.2f}\n".format(
            prec, tic_apext))

        # TIC MS2 zoom
        ax = axs[n, 1]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        ax.plot(ms2_ext.tic.t, ms2_ext.tic.i, "g-")
        ax.vlines(tic_apext, 0, tic_apexint, "r")
        ax.title.set_text(
            f"TIC MS2 zoom\nmz={prec:.4f}\n apex@{tic_apext:.2f}\n")

        # MS2 spectrum
        ax = axs[n, 2]
        ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
        ax.ticklabel_format(axis="x", style='plain')
        spec.plot(ax=ax)
        ax.title.set_text(
            f"MS/MS for\n{prec:.4f}\n@time={tic_apext:.2f}min\n")

        # One plot column per fragment, starting after the three fixed ones.
        for nn, (kk, row) in enumerate(grp.iterrows(), start=3):
            frag = row["Product_Mz"]
            try:
                fmz, fint = spec.get_apex_around(frag,
                                                 args.ms2_frag_tolerance)
                f_hw = spec.get_apex_width_pc(
                    frag, apex_pc=50, tolerance=args.ms2_frag_tolerance)
                f_area = spec.get_peak_area(
                    fmz, tolerance=args.ms2_frag_tolerance)

                s_ext = spec[fmz - args.ms2_frag_tolerance / 2:fmz +
                             args.ms2_frag_tolerance / 2]
                ax = axs[n, nn]
                s_ext.plot("go-", ax=ax)
                ax.ticklabel_format(axis="y", style='sci', scilimits=(0, 0))
                ax.ticklabel_format(axis="x", style='plain')
                ax.title.set_text(f"MS2 zoom\nmz={fmz:.4f}\n")
                ax.vlines(fmz, 0, max(s_ext.i), "r")
                ax.vlines(frag, 0, max(s_ext.i), "blue")
                axs[n, 2].plot([frag], [fint], "rx")
                ms2_w_left, ms2_w_right = s_ext.get_apex_times_pc(
                    fmz, apex_pc=50, tolerance=0.05)
                ax.hlines(fint / 2, ms2_w_left, ms2_w_right, "r")
                # Annotate with the fragment's own area/half-width (the MS1
                # values of the last MS1 target were shown here before).
                ax.text(0.6,
                        0.98,
                        f"Area=\n{f_area:.2e}\n\nHW=\n{f_hw:.2e}",
                        transform=ax.transAxes,
                        fontsize=8,
                        verticalalignment='top')
            except lcmsms.PeaksNotFound:
                # A missing fragment is reported and recorded as all zeros.
                print(f"No MS2 peak for {prec:.4f}/{frag:.4f}")
                f_hw = 0
                f_area = 0
                fmz = 0
                fint = 0

            row["MS2_TIC_Apex_time"] = tic_apext
            row["MS2_mass_apex_mz"] = fmz
            row["MS2_apex_height"] = fint
            row["MS2_peak_halfwidth"] = f_hw
            row["MS2_peak_area"] = f_area
            ms2_rows.append(row)

    results_ms2 = pd.DataFrame(ms2_rows, columns=ms2_columns)
    pdf.savefig(fig)
    plt.close(fig)
    results_ms2.set_index("Sequence", drop=True, inplace=True)

    # NOTE(review): at this point ``fig`` is the MS2 figure, yet it is saved
    # as "MS1.pdf" - kept as in the original, confirm the intended name.
    fig.savefig("MS1.pdf", dpi=1200, format='pdf', bbox_inches='tight')

    ms1_fname = b_fname + "_MS1_table.csv"
    ms2_fname = b_fname + "_MS2_table.csv"
    results_ms1.to_csv(ms1_fname, sep='\t')
    results_ms2.to_csv(ms2_fname, sep='\t')

    pdf.close()
# One row per identification: source file, spectrum id and sequence.
mzid_df = pd.DataFrame({'file': file_location, 'id': spectrum_ids, 'seq': seq})


def _parse_mzml_entry(entry):
    """Reduce one spectrum to ``(id, m/z array, intensity array)``.

    The id is converted to ``str`` and both arrays to ``numpy`` arrays.
    """
    ID = str(entry['id'])
    mz = np.array(entry['m/z array'])
    intensities = np.array(entry['intensity array'])
    return ID, mz, intensities


# Collect (file, id, m/z array, intensity array) for every spectrum of every
# distinct file referenced by the identifications.
all_spectra = []
for file in np.unique(file_location):
    print(file)
    # The context manager closes each reader once its spectra are collected.
    with mzml.MzML(file) as indexed:
        all_spectra.extend(
            (file, ) + entry for entry in indexed.map(_parse_mzml_entry))

# Transpose the list of tuples into one sequence per column.
mzml_location, ids, mz, intensities = zip(*all_spectra)
spectra_df = pd.DataFrame({
    'file': mzml_location,
    'id': ids,
    'mz': mz,
    'intensities': intensities
})

#### MERGE: mzid + mzml
import numpy as np
import pandas as pd


# %%
# define parsing function
def _parse_mzml_entry(entry):
    """Reduce one spectrum to ``(id, m/z array, intensity array)``.

    The id is converted to ``str`` and both arrays to ``numpy`` arrays.
    """
    ID = str(entry['id'])
    mz = np.array(entry['m/z array'])
    intensities = np.array(entry['intensity array'])
    return ID, mz, intensities


# Collect (file, id, m/z array, intensity array) for every spectrum.
all_spectra = []
data = '/home/ubuntu/data/jiahao/trp/output/Run1_U4_2000ng.mzML'
# The context manager closes the reader once all spectra are collected.
with mzml.MzML(data) as file:
    all_spectra.extend(
        (data, ) + entry for entry in file.map(_parse_mzml_entry))

# %%
# generate pandas dataframe
# Transpose the list of tuples into one sequence per column.
mzml_location, ids, mz, intensities = zip(*all_spectra)
spectra_df = pd.DataFrame({
    'file': mzml_location,
    'id': ids,
    'mz': mz,
    'intensities': intensities
})