Example #1
File: fast_binner.py Project: askerdb/emp
def bin_sparse_dok(mgf_file,
                   output_file=None,
                   min_bin=50,
                   max_bin=2000,
                   bin_size=0.01,
                   verbose=False,
                   remove_zero_sum_rows=True,
                   remove_zero_sum_cols=True):
    start = time.time()
    bins = np.arange(min_bin, max_bin, bin_size)

    reader0 = mgf.MGF(mgf_file)
    n_spectra = len([x for x in reader0])
    X = dok_matrix((len(bins), n_spectra), dtype=np.float32)
    reader = mgf.MGF(mgf_file)
    scan_names = []
    for spectrum_index, spectrum in enumerate(reader):
        # One scan name per spectrum, so scan_names lines up with the columns of X.
        scan_names.append(spectrum['params']['scans'])
        if len(spectrum['m/z array']) == 0:
            continue
        for mz, intensity in zip(spectrum['m/z array'],
                                 spectrum['intensity array']):
            target_bin = math.floor((mz - min_bin) / bin_size)
            X[target_bin, spectrum_index] += intensity

    X = X.tocsr()
    X_orig_shape = X.shape
    if remove_zero_sum_rows:
        print(X.shape)
        X, row_names_filter = filter_zero_rows(X)
        bins = [x for (x, v) in zip(bins, row_names_filter) if v]
        print("Removed %s rows" %
              (X_orig_shape[0] - X.shape[0])) if verbose else None

    if remove_zero_sum_cols:
        X, col_names_filter = filter_zero_cols(X)
        scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v]
        print("Removed %s cols" %
              (X_orig_shape[1] - X.shape[1])) if verbose else None

    if verbose:
        print(
            "Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)"
            % (time.time() - start, X.shape[0], X.shape[1], X.count_nonzero(),
               X.count_nonzero() / (n_spectra * len(bins))))

    if output_file is not None:
        pkl.dump((X, bins, scan_names), open(output_file, "wb"))
    return (X, bins, scan_names)
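
The helpers filter_zero_rows and filter_zero_cols are referenced but not shown in this example. A minimal sketch of what they might look like, assuming they take the CSR matrix and return the filtered matrix together with a boolean keep-mask (which is how they are used above); the listing also assumes the usual module-level imports (time, math, numpy as np, pickle as pkl, scipy.sparse.dok_matrix, pyteomics.mgf):

import numpy as np

def filter_zero_rows(X):
    # Keep rows (m/z bins) whose total intensity is non-zero.
    keep = np.asarray(X.sum(axis=1)).flatten() > 0
    return X[keep, :], keep

def filter_zero_cols(X):
    # Keep columns (spectra) whose total intensity is non-zero.
    keep = np.asarray(X.sum(axis=0)).flatten() > 0
    return X[:, keep], keep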
Example #2
def read_mgf_cosine(mgfFile, specific_spectra=0):
	spectra = []
	masses = []
	if specific_spectra == 0:
		if isinstance(mgfFile, list):
			for mgfs_n in mgfFile:
				with mgf.MGF(mgfs_n) as reader:
					for spectrum in reader:
						masses.append(spectrum['params']['pepmass'][0])
						temp = []
						for i in range(len(spectrum['m/z array'])):
							temp.append([spectrum['m/z array'][i], spectrum['intensity array'][i]])
						spectra.append(temp)
		else:
			with mgf.MGF(mgfFile) as reader:
				for spectrum in reader:
					masses.append(spectrum['params']['pepmass'][0])
					temp = []
					for i in range(len(spectrum['m/z array'])):
						temp.append([spectrum['m/z array'][i], spectrum['intensity array'][i]])
					spectra.append(temp)
	else:
		if isinstance(mgfFile, list):
			for n, mgfs_n in enumerate(mgfFile):
				with mgf.MGF(mgfs_n) as reader:
					s_in_mgf = []
					for s in specific_spectra:
						if s[0] - 1 == n:
							s_in_mgf.append(s[1])
					for k, spectrum in enumerate(reader):
						for j in s_in_mgf:
							if k == j - 1:
								masses.append(spectrum['params']['pepmass'][0])
								temp = []
								for i in range(len(spectrum['m/z array'])):
									temp.append([spectrum['m/z array'][i], spectrum['intensity array'][i]])
								spectra.append(temp)
		else:
			with mgf.MGF(mgfFile) as reader:
				for k, spectrum in enumerate(reader):
					for j in specific_spectra:
						if k == j - 1:
							masses.append(spectrum['params']['pepmass'][0])
							temp = []
							for i in range(len(spectrum['m/z array'])):
								temp.append([spectrum['m/z array'][i], spectrum['intensity array'][i]])
							spectra.append(temp)

	return spectra, masses
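
A hypothetical call with made-up file names; judging from the code, specific_spectra is a list of 1-based spectrum numbers for a single file, or a list of (file_number, spectrum_number) pairs when a list of files is passed:

# All spectra and precursor masses from one file.
spectra, masses = read_mgf_cosine('run1.mgf')

# Only spectrum 3 of run1.mgf and spectrum 7 of run2.mgf (both 1-based).
spectra, masses = read_mgf_cosine(['run1.mgf', 'run2.mgf'],
                                  specific_spectra=[(1, 3), (2, 7)])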
Example #3
 def _create_parser(self):
     if self._use_index:
         return _MGFParser(self.source_file,
                           read_charges=False,
                           convert_arrays=1,
                           encoding=self.encoding)
     return mgf.MGF(self.source_file,
                    read_charges=False,
                    convert_arrays=1,
                    encoding=self.encoding)
Example #4
def get_rt(file_location):
    """Return a dictionary mapping each scan number to its retention time."""
    retention_times = {}
    with open(file_location, 'r') as f:
        reader = mgf.MGF(f)
        for spec in reader:
            rt = spec['params']['rtinseconds']
            scan = spec['params']['scans']
            assert (scan not in retention_times)
            retention_times[scan] = rt
    return retention_times
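
A short usage sketch, assuming a hypothetical file sample.mgf whose spectra carry SCANS and RTINSECONDS parameters:

retention_times = get_rt('sample.mgf')
for scan, rt in sorted(retention_times.items()):
    print(scan, rt)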
Example #5
 def _create_parser(self):
     if self._use_index:
         return _MGFParser(self.source_file,
                           read_charges=False,
                           convert_arrays=1,
                           encoding=self.encoding)
     simple_reader = mgf.MGF(self.source_file,
                             read_charges=False,
                             convert_arrays=1,
                             encoding=self.encoding)
     simple_reader.index = OffsetIndex()
     return simple_reader
Example #6
 def read_mgf_iterative(self):
     # xtandem default spectrum parameter:
     # spectrum, minimum fragment mz Mfmin = 200 -> peaks caused by individual amino acid residues tend to be relatively small (m/z < 200)
     # spectrum, minimum peaks = 5 -> screen out spectra that contain too few fragment ions to be usefully interpreted.
     # spectrum, minimum parent m+h (mass + proton) = 850 -> suppress the analysis of spectra that were generated by low mass parent ions
     # -> This parameter is not used if the value of spectrum, use noise suppression = no = default
     # spectrum, dynamic range = 100 -> threshold peaks: scale the highest normalized intensity to 100, all others linearly scaled down; peaks < 1 are rejected
     # {'params':
     # {title 'Run1_U1_2000ng.5651.5651.4 File:"Run1_U1_2000ng.raw", NativeID:"controllerType=0 controllerNumber=1 scan=5651"',
     # 'rtinseconds': int, 'pepmass': (423.71 (mass), 694374.1875(intensity)), 'charge': [int] },
     # 'm/z array': [np array], 'intensity array': [np array], 'charged array':[masked array]
     # searchgui xtandem param  "dynamicRange": 100.0,
     #           "nPeaks": 50,
     #           "minPrecursorMass": 500.0,
     #           "minFragmentMz": 200.0,
     #           "minPeaksPerSpectrum": 5,
     #           "useNoiseSuppression": false,
     # default settings: passing spectra = 163113
     # without charge one: passing spectra = 88666 (xtandem used 88279)
     reader = mgf.MGF(source=self.spectra_file, use_header=True, convert_arrays=2, read_charges=True, dtype=None,
                             encoding='utf-8', read_ions=False)
     removed_spectra = set()
     number_of_spectra = 0
     for spectrum in reader:
         number_of_spectra += 1
         charge = 1 if 'charge' not in spectrum['params'].keys() else int(spectrum['params']['charge'][0])
         if charge == 1:
             removed_spectra.add(spectrum['params']['title'])
         elif len(spectrum['m/z array']) < 5:
             removed_spectra.add(spectrum['params']['title'])
         elif charge > 4:
             removed_spectra.add(spectrum['params']['title'])
         # elif int(spectrum['params']['pepmass'][0]) < 500: # not used since "useNoiseSuppression": false,
         #     removed_spectra.append(spectrum['params']['title']) # passing spectra:
         else:
             intensity_peaks = self.remove_peaks(spectrum['intensity array'], mz_min=200, dynamic_range=100)
             if len(intensity_peaks) < 5:
                 removed_spectra.add(spectrum['params']['title'])
         # try:
         #     if int(spectrum['params']['pepmass'][0]) < 850:
         #         removed_spectra.append(spectrum['params']['title'])
         # except TypeError: # None
         #     pass
     print(f"Number of spectra: {number_of_spectra}")
     print(f"Number of spectra not passing quality control: {len(removed_spectra)}")
     print(f"Number of spectra passing quality control: {number_of_spectra-len(removed_spectra)}")
Example #7
 def read_mgf(self,
              mgf_filename: Union[str, None] = None,
              no_new_psms: bool = False):
     """Read retention times from MGF file."""
     if not mgf_filename:
         mgf_filename = self.get_mgf_filename()
     with mgf.MGF(mgf_filename) as reader:
         for spectrum in reader:
             scan = int(spectrum["params"]["scans"])
             retention_time = float(spectrum["params"]["rtinseconds"])
             if scan not in self.peptide_spectrum_matches:
                 if not no_new_psms:
                     psm = PeptideSpectrumMatch(
                         scan=scan, retention_time=retention_time)
                     self.peptide_spectrum_matches[scan] = psm
             else:
                 self.peptide_spectrum_matches[
                     scan].retention_time = retention_time
Example #8
def get_spectra(source: Union[IO, str],
                scan_nrs: Union[Sequence[int], None] = None) \
        -> Iterator[MsmsSpectrum]:
    """
    Get the MS/MS spectra from the given MGF file, optionally filtering by
    scan number.

    Parameters
    ----------
    source : Union[IO, str]
        The MGF source (file name or open file object) from which the spectra
        are read.
    scan_nrs : Sequence[int], optional
        Only read spectra with the given scan numbers. If `None`, no filtering
        on scan number is performed.

    Returns
    -------
    Iterator[MsmsSpectrum]
        An iterator over the requested spectra in the given file.
    """
    with mgf.MGF(source) as f_in:
        # Iterate over a subset of spectra filtered by scan number.
        if scan_nrs is not None:

            def spectrum_it():
                for scan_nr, spectrum_dict in enumerate(f_in):
                    if scan_nr in scan_nrs:
                        yield spectrum_dict

        # Or iterate over all MS/MS spectra.
        else:

            def spectrum_it():
                yield from f_in

        for spectrum in spectrum_it():
            try:
                yield _parse_spectrum(spectrum)
            except ValueError:
                # Skip spectra that cannot be parsed into an MsmsSpectrum.
                pass
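
A hypothetical usage with a made-up file name; note that the generator matches scan_nrs against each spectrum's position in the file rather than a SCANS header value, and the sketch assumes the returned MsmsSpectrum objects expose identifier and mz attributes as in spectrum_utils:

# Yield only the spectra at positions 0, 5 and 42 in the file.
for spectrum in get_spectra('clustered.mgf', scan_nrs=[0, 5, 42]):
    print(spectrum.identifier, len(spectrum.mz))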
Example #9
for mgf_set_object in mgf_sets:
    name = mgf_set_object.name
    mgf_objects = mgf_set_object.mgfs
    psm_parent_names = mgf_set_object.psm_parent_names
    fieldnames = ['left', 'center', 'right']
    for mgf_object in mgf_objects:
        mgf_basename = os.path.basename(mgf_object.path)
        fieldnames.append(mgf_basename)
        fieldnames.extend([mgf_basename + '-' + x for x in psm_parent_names])
    histograms = []
    for mgf_object in mgf_objects:
        mgf_basename = os.path.basename(mgf_object.path)
        spectra = {}
        mgf_masses = []
        with open(mgf_object.path, 'r') as g:
            spec_iter = mgf.MGF(g)
            for x in spec_iter:
                assert ('scans' in x['params'])
                assert (isinstance(x['params']['scans'], str))
                spectra[x['params']['scans']] = Spectrum(
                    x['params']['pepmass'][0], x['params']['charge'][0])
                mgf_masses.append(x['params']['pepmass'][0] *
                                  x['params']['charge'][0])
        mgf_masses.sort()
        unnormalized_hist, hist, temp_bin_centers, temp_bin_edges = create_hist(
            mgf_masses, num_bins, min_mass, max_mass)
        if bin_centers is None and bin_edges is None:
            bin_centers = list(temp_bin_centers)
            bin_edges = list(temp_bin_edges)
            assert (len(bin_centers) + 1 == len(bin_edges))
        hist = MassHist(mgf_basename, list(unnormalized_hist))
Example #10
from pyteomics import mgf
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import sys

# 'NIST_GC_EI_MAX_LIBRARY.mgf'

if __name__=='__main__':
    filename = sys.argv[1]
    spectra = []
    with mgf.MGF(filename) as reader:
        for spectrum in reader:
            temp = {}
            inchi = spectrum['params'].get('inchi')
            if inchi is not None:
                mol = Chem.MolFromInchi(inchi)
            else:
                mol = None
            temp['FILENAME'] = filename
            temp['SEQ'] = '*..*'
            temp['COMPOUND_NAME'] = spectrum['params'].get('name')
            try:
                temp['MOLECULEMASS'] = Descriptors.MolWt(mol)
            except Exception:
                temp['MOLECULEMASS'] = 'N/A'
            temp['INSTRUMENT'] = spectrum['params'].get('instrument')
            temp['IONSOURCE'] = spectrum['params'].get('source_instrument')
            temp['EXTRACTSCAN'] = spectrum['params'].get('scans')
            temp['SMILES'] = spectrum['params'].get('smiles')
            temp['INCHI'] = spectrum['params'].get('inchi')
Example #11
        scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v]
        if verbose:
            print("Removed %s cols" % (X_orig_shape[1] - X.shape[1]))

    if verbose:
        print("Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)"
              % (time.time() - start, X.shape[0], X.shape[1],
                 X.count_nonzero(), X.count_nonzero() / (n_scans * len(bins))))

    if output_file is not None:
        pkl.dump((X, bins, scan_names), open(output_file, "wb"))
    return (X, bins, scan_names)

def row_filter_intensity(X, bin_names, threshold=1/1000):
    # Normalize each column (spectrum) to its total intensity, then keep only
    # the bins whose summed normalized intensity exceeds the threshold.
    colsums = np.array(X.sum(axis=0)).flatten()
    for i in range(X.shape[1]):
        X[:, i] = X[:, i] / colsums[i]
    rowsums = np.array(X.sum(axis=1)).flatten()
    rowkeep = rowsums > threshold
    X = X[rowkeep, :]
    bin_names = [x for (x, v) in zip(bin_names, rowkeep) if v]
    return (X, bin_names)
    
files = ["BILELIB19.mgf", "GNPS-NIH-CLINICALCOLLECTION1.mgf"]

all_spectra = []
for f in files:
    content = mgf.MGF(f)
    for spectra in content:
        if spectra['params']["source_instrument"].find("qT") > 0:
            all_spectra.append(spectra)

bin_sparse_dok(mgf = all_spectra, verbose = True, bin_size = 0.1, output_file = "metabolite_matrix.pkl")
Example #12
def bin_sparse_dok(mgf_file=None, mgf_files=None, output_file=None,
                   min_bin=50, max_bin=850, bin_size=0.01,
                   max_parent_mass=850, verbose=False,
                   remove_zero_sum_rows=True, remove_zero_sum_cols=True,
                   window_filter=True, filter_window_size=50,
                   filter_window_retain=3, filter_parent_peak=True):
    """ Bins an mgf file 

    Bins an mgf of ms2 spectra and returns a sparse dok matrix. Operates on either a single or a list of mgf files.

    Args:
    mgf_file: The path of an mgf file.
    mgf_files: A list of mgf files.
    output_file = Name of output file in pickle format.
    min_bin = smallest m/z value to be binned.
    max_bin = largest m/z value to be binned.
    bin_size: M/z range in one bin.
    max_parent_mass: Remove ions larger than this.
    verbose: Print debug info.
    remove_zero_sum_rows: Explicitly remove empty rows (bins).
    remove_zero_sum_cols: Explicitly remove spectra were all values were filtered away (columns)
    filter_parent_peak: Remove all ms2 peaks larger than the parent mass
    returns:
    A sparse dok matrix X, a list of bin names, and a list of spectra names 
    """
    start = time.time()
    bins = np.arange(min_bin, max_bin, bin_size)

    if mgf_file is not None:
        mgf_files = [mgf_file]
    
    n_scans = 0
    for file in mgf_files:
        reader0 = mgf.MGF(file)
        n_scans += len([x for x in reader0])

    X = dok_matrix((len(bins), n_scans), dtype=np.float32)
    scan_names = []
    for file in mgf_files:
        reader = mgf.MGF(file)
        base = os.path.basename(file)
        for spectrum_index, spectrum in enumerate(reader):
            scan_names.append(os.path.splitext(base)[0] + "_" + spectrum['params']['scans'])
            if spectrum['params']['pepmass'][0] > max_parent_mass:
                continue
            if len(spectrum['m/z array']) == 0:
                continue
            if window_filter:
                spectrum = filter_window(spectrum, filter_window_size, filter_window_retain)
            for mz, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                if mz > max_bin or mz > spectrum['params']['pepmass'][0]:
                    continue
                target_bin = math.floor((mz - min_bin)/bin_size)
                X[target_bin-1, spectrum_index] += intensity

    X = X.tocsr()
    X_orig_shape = X.shape
    if remove_zero_sum_rows:
        print(X.shape)
        X, row_names_filter = filter_zero_rows(X)
        bins = [x for (x, v) in zip(bins, row_names_filter) if v]
        if verbose:
            print("Removed %s rows" % (X_orig_shape[0] - X.shape[0]))

    if remove_zero_sum_cols:
        X, col_names_filter = filter_zero_cols(X)
        scan_names = [x for (x, v) in zip(scan_names, col_names_filter) if v]
        if verbose:
            print("Removed %s cols" % (X_orig_shape[1] - X.shape[1]))

    if verbose:
        print("Binned in %s seconds with dimensions %sx%s, %s nonzero entries (%s)"
              % (time.time() - start, X.shape[0], X.shape[1],
                 X.count_nonzero(), X.count_nonzero() / (n_scans * len(bins))))

    if output_file is not None:
        pkl.dump((X, bins, scan_names), open(output_file, "wb"))
    return (X, bins, scan_names)
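
A usage sketch with hypothetical file names, binning two MGF files at 0.1 m/z resolution and pickling the result, as described in the docstring above:

X, bin_names, scan_names = bin_sparse_dok(
    mgf_files=['plate1.mgf', 'plate2.mgf'],
    bin_size=0.1,
    verbose=True,
    output_file='binned_spectra.pkl')
print(X.shape, len(bin_names), len(scan_names))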
Example #13
#!/usr/bin/env python3

from pyteomics import mgf
import numpy as np
import pandas as pd
import os
bin_number = 3000
mz_range = (0, 1200)
bins = np.linspace(mz_range[0], mz_range[1], num=bin_number)

mgf_data = ['./data/agp500.mgf']

reader = [(os.path.basename(x), mgf.MGF(x)) for x in mgf_data]

spectra_bins = {}
for (name, mgfs) in reader:
    for m in mgfs:
        spectra_bins[name + "_" + str(m['params']['scans'])] = np.digitize(
            m["m/z array"], bins)

matrix = pd.DataFrame(0, index=bins, columns=spectra_bins.keys())
colnames = []
print(matrix.shape)
for k, v in spectra_bins.items():
    colnames.append(k)
    col = matrix.columns.get_loc(k)
    for i in v:
        # np.digitize returns 1-based bin indices, hence i - 1.
        matrix.iloc[i - 1, col] = 1

print(matrix.shape)
matrix = matrix[(matrix.T != 0).any()]