def bin_sparse_dok(mgf_file, output_file=None, min_bin=50, max_bin=2000, bin_size=0.01, verbose=False,
                   remove_zero_sum_rows=True, remove_zero_sum_cols=True):
    """Bin the peaks of one MGF file into a sparse (bins x spectra) matrix.

    The file is read twice: once to count the spectra so the matrix can be
    allocated, and once to accumulate peak intensities into their m/z bins.

    Args:
        mgf_file: MGF source handed to ``mgf.MGF``.
        output_file: If not None, the result tuple is pickled to this path.
        min_bin: Left edge of the first bin.
        max_bin: Upper limit of the binned range (exclusive).
        bin_size: Width of one bin.
        verbose: Print progress information.
        remove_zero_sum_rows: Pass the matrix through ``filter_zero_rows``
            and drop the matching bin names.
        remove_zero_sum_cols: Pass the matrix through ``filter_zero_cols``
            and drop the matching scan names.

    Returns:
        A tuple ``(X, bins, scan_names)``: ``X`` is the CSR matrix, ``bins``
        the left edge of each remaining row (a list after row filtering,
        otherwise the numpy array) and ``scan_names`` the ``scans`` value of
        each remaining column.
    """
    start = time.time()
    bins = np.arange(min_bin, max_bin, bin_size)
    n_bins = len(bins)

    # First pass: only count the spectra.
    with mgf.MGF(mgf_file) as counter:
        n_spectra = sum(1 for _ in counter)

    X = dok_matrix((n_bins, n_spectra), dtype=np.float32)
    scan_names = []
    with mgf.MGF(mgf_file) as reader:
        for spectrum_index, spectrum in enumerate(reader):
            # Every spectrum owns a column, so every spectrum gets a name;
            # otherwise names and columns drift apart after an empty spectrum.
            scan_names.append(spectrum['params']['scans'])
            for mz, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                target_bin = math.floor((mz - min_bin) / bin_size)
                # Peaks outside the binned range would wrap around (negative
                # index) or raise IndexError, so they are skipped.
                if not 0 <= target_bin < n_bins:
                    continue
                X[target_bin, spectrum_index] += intensity

    X = X.tocsr()
    X_orig_shape = X.shape
    if remove_zero_sum_rows:
        if verbose:
            print(X.shape)
        X, row_names_filter = filter_zero_rows(X)
        bins = [x for (x, v) in zip(bins, row_names_filter) if v]
        if verbose:
            print("Removed %s rows" % (X_orig_shape[0] - X.shape[0]))
    if remove_zero_sum_cols:
        X, col_names_filter = filter_zero_cols(X)
        scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v]
        if verbose:
            print("Removed %s cols" % (X_orig_shape[1] - X.shape[1]))

    if verbose:
        n_cells = n_spectra * len(bins)
        nonzero = X.count_nonzero()
        density = nonzero / n_cells if n_cells else 0.0
        print("Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)"
              % (time.time() - start, X.shape[0], X.shape[1], nonzero, density))

    if output_file is not None:
        with open(output_file, "wb") as handle:
            pkl.dump((X, bins, scan_names), handle)
    return (X, bins, scan_names)
def _peak_pairs(spectrum):
    """Return the peaks of *spectrum* as a list of ``[m/z, intensity]`` pairs."""
    return [[mz, intensity]
            for mz, intensity in zip(spectrum['m/z array'], spectrum['intensity array'])]


def _append_spectra(path, wanted, spectra, masses):
    """Append the spectra of one MGF file to *spectra* and *masses*.

    *wanted* is None to take every spectrum, otherwise an iterable of 1-based
    positions; a position listed several times is appended several times.
    """
    with mgf.MGF(path) as reader:
        for position, spectrum in enumerate(reader, start=1):
            if wanted is None:
                repeats = 1
            else:
                repeats = sum(1 for j in wanted if j == position)
            for _ in range(repeats):
                masses.append(spectrum['params']['pepmass'][0])
                spectra.append(_peak_pairs(spectrum))


def read_mgf_cosine(mgfFile, specific_spectra=0):
    """Read peak lists and precursor masses from one or several MGF files.

    Args:
        mgfFile: A single MGF source, or a list of them.
        specific_spectra: ``0`` reads every spectrum. Otherwise, for a single
            file, an iterable of 1-based spectrum positions; for a list of
            files, an iterable of ``(file_number, spectrum_position)`` pairs,
            both 1-based.

    Returns:
        ``(spectra, masses)``: ``spectra`` holds one list of
        ``[m/z, intensity]`` pairs per selected spectrum and ``masses`` the
        first ``pepmass`` entry of each, in file order.
    """
    spectra = []
    masses = []
    read_everything = specific_spectra == 0

    if isinstance(mgfFile, list):
        for file_number, path in enumerate(mgfFile, start=1):
            if read_everything:
                wanted = None
            else:
                wanted = [s[1] for s in specific_spectra if s[0] == file_number]
            _append_spectra(path, wanted, spectra, masses)
    else:
        wanted = None if read_everything else list(specific_spectra)
        _append_spectra(mgfFile, wanted, spectra, masses)

    return spectra, masses
def _create_parser(self):
    """Return an MGF reader over ``self.source_file``.

    ``_MGFParser`` is used when ``self._use_index`` is set, ``mgf.MGF``
    otherwise; both receive the same reader options.
    """
    reader_type = _MGFParser if self._use_index else mgf.MGF
    return reader_type(
        self.source_file,
        read_charges=False,
        convert_arrays=1,
        encoding=self.encoding,
    )
def get_rt(file_location):
    """Return a dictionary mapping each scan to its retention time (RT).

    Scans and retention times are the ``scans`` and ``rtinseconds``
    parameters of each spectrum; a repeated scan trips the assertion.
    """
    retention_times = {}
    with open(file_location, 'r') as handle:
        for spectrum in mgf.MGF(handle):
            params = spectrum['params']
            rt, scan = params['rtinseconds'], params['scans']
            assert (scan not in retention_times)
            retention_times[scan] = rt
    return retention_times
def _create_parser(self):
    """Return an MGF reader over ``self.source_file``.

    With ``self._use_index`` set this is an ``_MGFParser``. Otherwise it is
    a plain ``mgf.MGF`` whose ``index`` attribute is replaced by a fresh
    ``OffsetIndex``.
    """
    if not self._use_index:
        plain_reader = mgf.MGF(
            self.source_file,
            read_charges=False,
            convert_arrays=1,
            encoding=self.encoding,
        )
        plain_reader.index = OffsetIndex()
        return plain_reader
    return _MGFParser(
        self.source_file,
        read_charges=False,
        convert_arrays=1,
        encoding=self.encoding,
    )
def read_mgf_iterative(self):
    """Count the spectra of ``self.spectra_file`` that pass quality control.

    A spectrum is rejected when its charge is 1 (also assumed when no charge
    is given) or above 4, when it has fewer than 5 peaks, or when fewer than
    5 peaks remain after ``self.remove_peaks``. Rejected spectra are tracked
    by title, so spectra sharing a title count once. The three totals are
    printed; nothing is returned.

    Author notes on the X!Tandem defaults these thresholds follow:
      * spectrum, minimum fragment mz = 200: peaks caused by individual
        amino acid residues tend to be relatively small (m/z < 200).
      * spectrum, minimum peaks = 5: screens out spectra with too few
        fragment ions to be usefully interpreted.
      * spectrum, minimum parent m+h = 850: suppresses spectra generated by
        low mass parent ions; not used while "use noise suppression" is
        "no" (the default), hence no precursor-mass filter here.
      * spectrum, dynamic range = 100: the highest intensity is scaled to
        100, all others linearly, and peaks below 1 are rejected.
      * SearchGUI X!Tandem parameters: dynamicRange 100.0, nPeaks 50,
        minPrecursorMass 500.0, minFragmentMz 200.0,
        minPeaksPerSpectrum 5, useNoiseSuppression false.
      * Recorded counts: default settings 163113 passing spectra; without
        charge one 88666 (X!Tandem used 88279).
    """
    min_peaks = 5
    removed_spectra = set()
    number_of_spectra = 0

    # The context manager closes the reader once the scan is finished.
    with mgf.MGF(source=self.spectra_file, use_header=True, convert_arrays=2, read_charges=True,
                 dtype=None, encoding='utf-8', read_ions=False) as reader:
        for spectrum in reader:
            number_of_spectra += 1
            params = spectrum['params']
            charge = int(params['charge'][0]) if 'charge' in params else 1

            if charge == 1 or charge > 4 or len(spectrum['m/z array']) < min_peaks:
                removed_spectra.add(params['title'])
                continue

            intensity_peaks = self.remove_peaks(spectrum['intensity array'], mz_min=200, dynamic_range=100)
            if len(intensity_peaks) < min_peaks:
                removed_spectra.add(params['title'])

    print(f"Number of spectra: {number_of_spectra}")
    print(f"Number of spectra not passing quality control: {len(removed_spectra)}")
    print(f"Number of spectra passing quality control: {number_of_spectra-len(removed_spectra)}")
def read_mgf(self, mgf_filename: Union[str, None] = None, no_new_psms: bool = False):
    """Copy retention times from an MGF file onto the stored PSMs.

    When *mgf_filename* is falsy the path comes from
    ``self.get_mgf_filename()``. A scan already present in
    ``self.peptide_spectrum_matches`` gets its retention time updated; an
    unknown scan gets a new ``PeptideSpectrumMatch`` unless *no_new_psms*
    is set.
    """
    path = mgf_filename if mgf_filename else self.get_mgf_filename()
    with mgf.MGF(path) as reader:
        for spectrum in reader:
            params = spectrum["params"]
            scan = int(params["scans"])
            retention_time = float(params["rtinseconds"])

            if scan in self.peptide_spectrum_matches:
                self.peptide_spectrum_matches[scan].retention_time = retention_time
                continue
            if no_new_psms:
                continue
            self.peptide_spectrum_matches[scan] = PeptideSpectrumMatch(
                scan=scan, retention_time=retention_time)
def get_spectra(source: Union[IO, str], scan_nrs: Sequence[int] = None)\
        -> Iterator[MsmsSpectrum]:
    """
    Get the MS/MS spectra from the given MGF file, optionally filtering by
    position.

    Parameters
    ----------
    source : Union[IO, str]
        The MGF source (file name or open file object) from which the spectra
        are read.
    scan_nrs : Sequence[int]
        Only read the spectra at these 0-based positions in the file. The
        values are compared with the running index of the spectrum, not with
        its ``SCANS`` field. If `None`, no filtering is performed.

    Returns
    -------
    Iterator[MsmsSpectrum]
        An iterator over the requested spectra in the given file. Spectra for
        which `_parse_spectrum` raises `ValueError` are silently skipped.
    """
    # Built once so the per-spectrum membership test is a hash lookup.
    wanted = None if scan_nrs is None else frozenset(scan_nrs)
    with mgf.MGF(source) as f_in:
        for position, spectrum_dict in enumerate(f_in):
            if wanted is not None and position not in wanted:
                continue
            # Only the parse is guarded; the yield stays outside the try so a
            # ValueError thrown into the generator is not swallowed.
            try:
                parsed = _parse_spectrum(spectrum_dict)
            except ValueError:
                continue
            yield parsed
# For every MGF set: collect the field names, then read each MGF file to
# gather precursor masses and histogram them.
# NOTE(review): `mgf_sets`, `num_bins`, `min_mass`, `max_mass`, `bin_centers`
# and `bin_edges` are defined outside this fragment, and the fragment stops
# before `name`, `spectra`, `histograms` or `hist` are used. The block nesting
# below was reconstructed from the statement order; confirm it.
for mgf_set_object in mgf_sets:
    name = mgf_set_object.name
    mgf_objects = mgf_set_object.mgfs
    psm_parent_names = mgf_set_object.psm_parent_names
    # Three fixed fields, then per MGF file its basename plus one
    # "<basename>-<parent>" field for every PSM parent name.
    fieldnames = ['left', 'center', 'right']
    for mgf_object in mgf_objects:
        mgf_basename = os.path.basename(mgf_object.path)
        fieldnames.append(mgf_basename)
        fieldnames.extend([mgf_basename + '-' + x for x in psm_parent_names])
    histograms = []
    for mgf_object in mgf_objects:
        mgf_basename = os.path.basename(mgf_object.path)
        spectra = {}
        mgf_masses = []
        with open(mgf_object.path, 'r') as g:
            spec_iter = mgf.MGF(g)
            for x in spec_iter:
                # Each spectrum must carry a string `scans` parameter; it is
                # the key under which the spectrum is stored.
                assert ('scans' in x['params'])
                assert (isinstance(x['params']['scans'], str))
                spectra[x['params']['scans']] = Spectrum(
                    x['params']['pepmass'][0], x['params']['charge'][0])
                # Histogram input: first pepmass entry times first charge entry.
                mgf_masses.append(x['params']['pepmass'][0] * x['params']['charge'][0])
        mgf_masses.sort()
        unnormalized_hist, hist, temp_bin_centers, temp_bin_edges = create_hist(
            mgf_masses, num_bins, min_mass, max_mass)
        # Centers and edges are only stored while both are still None.
        if bin_centers is None and bin_edges is None:
            bin_centers = list(temp_bin_centers)
            bin_edges = list(temp_bin_edges)
            assert (len(bin_centers) + 1 == len(bin_edges))
        # Rebinds `hist` (returned by create_hist above) to a MassHist built
        # from the unnormalized counts.
        hist = MassHist(mgf_basename, list(unnormalized_hist))
from pyteomics import mgf
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import sys

# Example input: 'NIST_GC_EI_MAX_LIBRARY.mgf'
if __name__=='__main__':
    # The MGF file to convert is the first command-line argument.
    filename = sys.argv[1]
    spectra = []
    with mgf.MGF(filename) as reader:
        for spectrum in reader:
            # One record per spectrum, filled from the spectrum parameters.
            temp = {}
            params = spectrum['params']
            inchi = params.get('inchi')
            # Without an InChI there is no molecule to derive the mass from.
            mol = Chem.MolFromInchi(inchi) if inchi is not None else None
            temp['FILENAME'] = filename
            temp['SEQ'] = '*..*'
            temp['COMPOUND_NAME'] = params.get('name')
            # Best effort: a missing or unusable molecule yields 'N/A'.
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            try:
                temp['MOLECULEMASS'] = Descriptors.MolWt(mol)
            except Exception:
                temp['MOLECULEMASS'] = 'N/A'
            temp['INSTRUMENT'] = params.get('instrument')
            temp['IONSOURCE'] = params.get('source_instrument')
            temp['EXTRACTSCAN'] = params.get('scans')
            temp['SMILES'] = params.get('smiles')
            temp['INCHI'] = inchi
scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v] print("Removed %s cols" % (X_orig_shape[1] - X.shape[1] )) if verbose else None if verbose: print("Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)" % (time.time()-start, X.shape[0], X.shape[1], X.count_nonzero(), X.count_nonzero()/(n_scans*len(bins)))) if output_file is not None: pkl.dump((X, bins, scan_names),open( output_file, "wb")) return(X, bins, scan_names)


def row_filter_intensity(X, bin_names, threshold = 1/1000):
    """Normalize every column of X to sum 1, then drop low-intensity rows.

    Each column is divided by its own sum by assigning into X, so the matrix
    passed in is modified. Rows whose sum over the normalized columns is not
    greater than *threshold* are removed, together with their entries in
    *bin_names*.

    Returns:
        A tuple ``(X, bin_names)`` with the filtered matrix and names.
    """
    # NOTE(review): presumably X is a scipy sparse matrix that supports column
    # assignment; a column whose sum is 0 is divided by 0 here. Confirm that
    # callers filter empty columns first.
    colsums = np.array(X.sum(axis = 0)).flatten()
    for i in range(X.shape[1]):
        X[:, i] = X[:, i]/colsums[i]
    rowsums = np.array(X.sum(axis = 1)).flatten()
    rowkeep = rowsums > threshold
    X = X[rowkeep, :]
    bin_names = [x for (x, v) in zip(bin_names, rowkeep) if v]
    return((X, bin_names))


# Driver: collect spectra from both library files and bin them.
files = ["BILELIB19.mgf", "GNPS-NIH-CLINICALCOLLECTION1.mgf"]
all_spectra = []
for f in files:
    content = mgf.MGF(f)
    for spectra in content:
        # NOTE(review): `find(...) > 0` only accepts "qT" at index 1 or later;
        # a value that starts with "qT" is rejected. Confirm this is intended.
        if spectra['params']["source_instrument"].find("qT") > 0:
            all_spectra.append(spectra)
# NOTE(review): the signature of this file's bin_sparse_dok is not visible
# here; verify that it accepts an `mgf` keyword holding parsed spectra.
bin_sparse_dok(mgf = all_spectra, verbose = True, bin_size = 0.1, output_file = "metabolite_matrix.pkl")
def bin_sparse_dok(mgf_file=None, mgf_files=None, output_file=None, min_bin=50, max_bin=850,
                   bin_size=0.01, max_parent_mass=850, verbose=False, remove_zero_sum_rows=True,
                   remove_zero_sum_cols=True, window_filter=True, filter_window_size=50,
                   filter_window_retain=3, filter_parent_peak=True):
    """Bin the ms2 spectra of one or several mgf files into a sparse matrix.

    Every spectrum of every file gets its own column, in reading order. A
    peak is added to the row of the bin whose left edge is the largest one
    not above its m/z; peaks outside ``[min_bin, max_bin)`` are ignored.

    Args:
        mgf_file: The path of an mgf file. Takes precedence over mgf_files.
        mgf_files: A list of mgf file paths.
        output_file: Name of output file in pickle format.
        min_bin: Smallest m/z value to be binned.
        max_bin: Upper limit (exclusive) of the m/z values to be binned.
        bin_size: M/z range in one bin.
        max_parent_mass: Spectra whose parent mass is larger than this
            contribute no peaks (their column stays empty).
        verbose: Print debug info.
        remove_zero_sum_rows: Explicitly remove empty rows (bins).
        remove_zero_sum_cols: Explicitly remove spectra where all values
            were filtered away (columns).
        window_filter: Pass every spectrum through ``filter_window`` first.
        filter_window_size: Second argument given to ``filter_window``.
        filter_window_retain: Third argument given to ``filter_window``.
        filter_parent_peak: Remove all ms2 peaks larger than the parent mass.

    Returns:
        A tuple ``(X, bins, scan_names)``: the CSR matrix, the left edge of
        each remaining bin, and ``"<file stem>_<scans>"`` for each remaining
        column.

    Raises:
        ValueError: If neither mgf_file nor mgf_files is given.
    """
    start = time.time()
    bins = np.arange(min_bin, max_bin, bin_size)
    n_bins = len(bins)

    if mgf_file is not None:
        mgf_files = [mgf_file]
    if mgf_files is None:
        raise ValueError("Either mgf_file or mgf_files must be given")

    # First pass: count the spectra of all files to size the matrix.
    n_scans = 0
    for path in mgf_files:
        with mgf.MGF(path) as counter:
            n_scans += sum(1 for _ in counter)

    X = dok_matrix((n_bins, n_scans), dtype=np.float32)
    scan_names = []
    for path in mgf_files:
        file_stem = os.path.splitext(os.path.basename(path))[0]
        with mgf.MGF(path) as reader:
            for spectrum in reader:
                # The column index runs across all files; a per-file index
                # would make later files overwrite the columns of earlier ones.
                column = len(scan_names)
                scan_names.append(file_stem + "_" + spectrum['params']['scans'])
                if spectrum['params']['pepmass'][0] > max_parent_mass:
                    continue
                if len(spectrum['m/z array']) == 0:
                    continue
                if window_filter:
                    spectrum = filter_window(spectrum, filter_window_size, filter_window_retain)
                parent_mass = spectrum['params']['pepmass'][0]
                for mz, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                    if filter_parent_peak and mz > parent_mass:
                        continue
                    target_bin = math.floor((mz - min_bin) / bin_size)
                    # Row i is labelled bins[i], the left edge of the bin, so
                    # the floor result is the row index itself. Out-of-range
                    # peaks are dropped instead of wrapping to the other end.
                    if not 0 <= target_bin < n_bins:
                        continue
                    X[target_bin, column] += intensity

    X = X.tocsr()
    X_orig_shape = X.shape
    if remove_zero_sum_rows:
        if verbose:
            print(X.shape)
        X, row_names_filter = filter_zero_rows(X)
        bins = [x for (x, v) in zip(bins, row_names_filter) if v]
        if verbose:
            print("Removed %s rows" % (X_orig_shape[0] - X.shape[0]))
    if remove_zero_sum_cols:
        X, col_names_filter = filter_zero_cols(X)
        scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v]
        if verbose:
            print("Removed %s cols" % (X_orig_shape[1] - X.shape[1]))

    if verbose:
        n_cells = n_scans * len(bins)
        nonzero = X.count_nonzero()
        density = nonzero / n_cells if n_cells else 0.0
        print("Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)"
              % (time.time() - start, X.shape[0], X.shape[1], nonzero, density))

    if output_file is not None:
        with open(output_file, "wb") as handle:
            pkl.dump((X, bins, scan_names), handle)
    return (X, bins, scan_names)
#!/usr/bin/env python3
from pyteomics import mgf
import numpy as np
import pandas as pd
import os

# Bin edges: `bin_number` evenly spaced values across `mz_range`.
bin_number = 3000
mz_range = (0, 1200)
bins = np.linspace(mz_range[0], mz_range[1], num=bin_number)

mgf_data = ['./data/agp500.mgf']
reader = [(os.path.basename(x), mgf.MGF(x)) for x in mgf_data]

# Map "<file name>_<scans>" to the bin index of every peak of that spectrum.
spectra_bins = {}
for (name, mgfs) in reader:
    for index, m in enumerate(mgfs):
        spectra_bins[name + "_" + str(m['params']['scans'])] = np.digitize(
            m["m/z array"], bins)

# Presence/absence matrix: one row per bin edge, one column per spectrum.
matrix = pd.DataFrame(0, index=bins, columns=spectra_bins.keys())
colnames = []
print(matrix.shape)
# Columns were created in the dict's key order, so the enumeration position
# of a key is its column position.
for column, (k, v) in enumerate(spectra_bins.items()):
    colnames.append(k)
    if v.size:
        # np.digitize is 1-based for values inside the range, hence the -1.
        # A single .iloc call writes into `matrix` itself; the chained
        # `.iloc[row].loc[col] = 1` form may assign to a temporary copy.
        matrix.iloc[v - 1, column] = 1
print(matrix.shape)
# Keep only the bins that are hit by at least one spectrum.
matrix = matrix[(matrix.T != 0).any()]