示例#1
0
def median_filtering(filenames, scans_count, n_median, mzs_net, mz_len):
    """Bin the spectra of several files, optionally median-filtering them.

    Every spectrum is resampled with
    ``binning(mz, intensity, mzs_net, mz_len)``.  With ``n_median == 1`` each
    binned spectrum becomes one output row.  Otherwise ``n_median``
    consecutive spectra are binned and their element-wise median becomes one
    output row.

    Parameters
    ----------
    filenames : sequence of str
        Paths handed to ``pymz.read``.
    scans_count : numpy.ndarray of int
        ``scans_count.sum()`` is the number of output rows.  When
        ``n_median != 1``, ``scans_count[i]`` is the number of output rows
        produced from ``filenames[i]``.
    n_median : int
        Number of consecutive spectra combined into one output row.
    mzs_net, mz_len
        Forwarded to ``binning``; ``mz_len`` is also the output column count.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(scans_count.sum(), mz_len)``.

    Raises
    ------
    StopIteration
        If ``n_median != 1`` and a file holds fewer than
        ``scans_count[i] * n_median`` spectra.
    """
    spectra_binned = np.zeros((scans_count.sum(), mz_len))

    def binned(tree):
        # One spectrum resampled onto the common m/z grid.
        return binning(tree['m/z array'], tree['intensity array'], mzs_net,
                       mz_len)

    row = 0
    if n_median == 1:
        for filename in filenames:
            # The context manager closes the file once it has been read.
            with pymz.read(filename) as reader:
                print(filename)
                for tree in reader:
                    spectra_binned[row, :] = binned(tree)
                    row += 1
    else:
        # Scratch buffer reused for every group of n_median spectra.
        spectra_to_median = np.zeros((n_median, mz_len))
        for i, filename in enumerate(filenames):
            with pymz.read(filename) as reader:
                print(filename)
                for _ in range(scans_count[i]):
                    for t in range(n_median):
                        spectra_to_median[t, :] = binned(next(reader))
                    spectra_binned[row, :] = np.median(spectra_to_median,
                                                       axis=0)
                    row += 1
    return spectra_binned
示例#2
0
def load_control_data(args):
    """Load every spectrum of the control dataset named by ``args['control']``.

    ``args['control']`` may be:

    * an ``.mzXML`` file, which is read directly;
    * a path ending in ``d`` or ``.RAW``, in which case ``args['control']`` is
      rewritten to the same path with an ``.mzXML`` extension and that file is
      read;
    * anything else, which is treated as a directory and walked recursively
      for ``.mzXML`` files.  If none are found, ``raw_data_detection`` and
      ``msconvert`` are run and the directory is scanned again.

    Parameters
    ----------
    args : dict
        Must contain the key ``'control'``.  It is modified in place in the
        ``d`` / ``.RAW`` case.

    Returns
    -------
    list
        All spectra yielded by ``pytmzxml.read`` for the selected file(s).

    Notes
    -----
    Exits the process with status 1 when ``args['control']`` is empty.
    """
    control = args['control']
    if control == '':
        print("No control file specified. Exiting BLANKA.")
        sys.exit(1)
    if control.endswith('.mzXML'):
        return list(pytmzxml.read(control))
    # NOTE(review): endswith('d') matches any path ending in the letter "d";
    # presumably '.d' directories were intended -- confirm before tightening.
    if control.endswith('d') or control.endswith('.RAW'):
        # splitext only strips the final extension, so dots in directory
        # names or in the file stem no longer truncate the path.
        args['control'] = os.path.splitext(control)[0] + '.mzXML'
        return list(pytmzxml.read(args['control']))

    def find_mzxml_files():
        # Recursively collect every .mzXML file below the control directory.
        return [
            os.path.join(dirpath, name)
            for dirpath, _dirnames, names in os.walk(args['control'])
            for name in names if name.endswith('.mzXML')
        ]

    control_files = find_mzxml_files()
    if not control_files:
        # Nothing converted yet: convert the raw data, then look again.
        control_raw_list = raw_data_detection(args, args['control'])
        msconvert(args, control_raw_list)
        control_files = find_mzxml_files()

    control_data = []
    for control_file in control_files:
        control_data.extend(pytmzxml.read(control_file))
    print(len(control_data))
    return control_data
示例#3
0
def MGF_generator(df, folder_mzxml, output_filename):
    """Write the spectra referenced by the rows of ``df`` to one MGF file.

    For every row, the file ``folder_mzxml + row['Original_Peaklist'] +
    '.mzXML'`` is opened and the scan with id ``str(row['ScanmzXML'])`` is
    looked up.  Its m/z and intensity arrays are stored together with
    TITLE / CHARGE / PEPMASS / SCANS parameters built from the row.  The
    collected spectra are written to ``output_filename + '.mgf'`` with
    ``mgf.write``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Original_Peaklist``, ``ScanmzXML``, ``ScanId``
        and ``ExpMz``.
    folder_mzxml : str
        Prefix concatenated directly with the peak-list name (no separator
        is inserted).
    output_filename : str
        Output path without the ``.mgf`` extension.
    """
    collected = []
    for _, row in df.iterrows():
        peaklist_name = row['Original_Peaklist']
        with mzxml.read(folder_mzxml + peaklist_name + '.mzXML') as reader:
            scan_key = str(row['ScanmzXML'])
            scan = reader[scan_key]
            scan_id = str(row['ScanId'])
            title = 'File:' + peaklist_name + '.' + scan_key
            title += ' "scan=' + scan_id + '"'
            collected.append({
                'params': {
                    'TITLE': title,
                    'CHARGE': '1+',
                    'PEPMASS': str(row['ExpMz']),
                    'SCANS': scan_id,
                },
                'm/z array': scan['m/z array'],
                'intensity array': scan['intensity array'],
            })

    mgf.write(collected, output_filename + '.mgf')
示例#4
0
def read_mzxml(PATH, scanlist, event_scan, fname, output, soutput, newscanno,
               spec_outfile):
    """Export the selected scans of one file as MGF text and index lines.

    If ``PATH`` is not a readable regular file nothing happens.  Otherwise the
    file is iterated and, for every 0-based scan index present in
    ``scanlist``:

    * a tab-separated line (pev, scan number, ``spec_outfile``, pseq, etype)
      taken from ``event_scan[fname][scanindex]`` is appended to ``output``;
    * ``newscanno`` is incremented;
    * a ``BEGIN IONS`` ... ``END IONS`` record is written to ``soutput``.

    Any error while handling one scan is reported with a printed message and
    the loop moves on to the next scan.

    Parameters
    ----------
    PATH : str
        File opened with ``mzxml.read``.
    scanlist : container
        Scan indices to export (membership is tested with ``in``).
    event_scan : dict
        ``event_scan[fname][scanindex]`` must provide 'pev', 'pseq', 'etype'.
    fname : hashable
        Key into ``event_scan``.
    output : list
        Receives the index lines.
    soutput : file-like
        Receives the MGF text through ``write``.
    newscanno : int
        Running scan counter.
    spec_outfile : str
        Value written in the third column of each index line.

    Returns
    -------
    int
        The updated scan counter.
    """
    if not (os.path.isfile(PATH) and os.access(PATH, os.R_OK)):
        return newscanno

    with mzxml.read(PATH) as reader:
        for scanindex, spectrum in enumerate(reader):
            # `in` replaces dict.has_key(), which does not exist in Python 3.
            if scanindex not in scanlist:
                continue
            try:
                event = event_scan[fname][scanindex]
                output.append("%s\t%d\t%s\t%s\t%s\n" %
                              (event['pev'], newscanno, spec_outfile,
                               event['pseq'], event['etype']))
                # NOTE(review): the counter is bumped before the MGF title is
                # written, so the title carries the index-line number + 1 --
                # confirm this offset is intended.
                newscanno += 1
                selected_ion = spectrum['precursorList']['precursor'][0][
                    'selectedIonList']['selectedIon'][0]
                charge = int(selected_ion['charge state'])
                mz = selected_ion['selected ion m/z']
                soutput.write(
                    "BEGIN IONS\nTITLE=controllerType=0 controllerNumber=1 scan=%d\nCHARGE=%d+\nPEPMASS=%s\n"
                    % (newscanno, charge, mz))
                for x, y in zip(spectrum['m/z array'],
                                spectrum['intensity array']):
                    soutput.write("%s %s\n" % (x, y))
                soutput.write("END IONS\n\n")
            except Exception:
                # Still best-effort per scan, but Ctrl-C and SystemExit are no
                # longer swallowed.
                print("Error reading mzML file")

    return newscanno
示例#5
0
def readMs1Ms2(mzXmlPath):
    """Split the spectra of an mzXML file into MS1 and MS2 dictionaries.

    Returns
    -------
    tuple of (dict, dict)
        ``ms1``: scan number -> list of ``(m/z, intensity)`` pairs.

        ``ms2``: scan number -> ``[neutral precursor mass, peak list,
        precursor scan number, precursor charge, precursor m/z]`` where the
        neutral mass is ``precursorMz * charge - ATOM_MASS['Z'] * charge``.

    Spectra whose ``msLevel`` is neither 1 nor 2 are ignored.
    """
    ms1_by_scan = {}
    ms2_by_scan = {}
    with mzxml.read(mzXmlPath) as reader:
        for scan in reader:
            level = scan['msLevel']
            if level == 1:
                ms1_by_scan[int(scan['num'])] = list(
                    zip(scan['m/z array'], scan['intensity array']))
            elif level == 2:
                precursor = scan['precursorMz'][0]
                charge = precursor['precursorCharge']
                neutral_mass = (precursor['precursorMz'] * charge -
                                ATOM_MASS['Z'] * charge)
                parent_scan = int(precursor['precursorScanNum'])
                precursor_mz = precursor['precursorMz']
                peaks = list(zip(scan['m/z array'], scan['intensity array']))
                ms2_by_scan[int(scan['num'])] = [
                    neutral_mass, peaks, parent_scan, charge, precursor_mz
                ]

    return ms1_by_scan, ms2_by_scan
def create_mzxml_to_csodiaq_dict(mzxmlFile):
    """Map a precursor/CV signature of each scan to its scan number.

    The key is ``(round(first precursor m/z, 2), round(second precursor m/z,
    2), compensationVoltage)`` and the value is the scan's ``'num'`` entry.
    When several scans share a key the last one read wins.
    """
    lookup = {}
    with mzxml.read(mzxmlFile) as reader:
        for scan in reader:
            precursors = scan['precursorMz']
            first_mz = round(precursors[0]['precursorMz'], 2)
            second_mz = round(precursors[1]['precursorMz'], 2)
            signature = (first_mz, second_mz, scan['compensationVoltage'])
            lookup[signature] = scan['num']
    return lookup
示例#7
0
def readPeaksFromXML(mzXmlPath):
    """Read every spectrum of an mzXML file into a dictionary.

    Returns
    -------
    dict
        scan number -> ``[precursorMz * precursorCharge, peaks]`` where
        ``peaks`` is a list of ``(m/z, intensity)`` pairs.
    """
    spectra_by_scan = {}
    with mzxml.read(mzXmlPath) as reader:
        for scan in reader:
            precursor = scan['precursorMz'][0]
            scaled_mz = precursor['precursorMz'] * precursor['precursorCharge']
            peaks = list(zip(scan['m/z array'], scan['intensity array']))
            spectra_by_scan[int(scan['num'])] = [scaled_mz, peaks]

    return spectra_by_scan
示例#8
0
def get_mzxml(path, cutOff=100, digits=2):
    """Yield an ``(mz, intensity)`` pair for every spectrum in an mzXML file.

    Each spectrum is first passed through ``trim_spectrum(mz, intensity,
    cutOff)`` when ``cutOff`` is truthy, then through ``round_spec(mz,
    intensity)``.

    NOTE(review): ``digits`` is accepted but never forwarded to
    ``round_spec`` -- confirm whether it should be.
    """
    with mzxml.read(path) as spectra:
        for scan in spectra:
            mz_values = scan['m/z array']
            intensities = scan['intensity array']
            if cutOff:
                mz_values, intensities = trim_spectrum(mz_values, intensities,
                                                       cutOff)
            rounded_mz, rounded_intensity = round_spec(mz_values, intensities)
            yield rounded_mz, rounded_intensity
示例#9
0
def preprocess_sample(sample):
    """Read one mzXML file and collect its MS1 summaries and MS2 spectra.

    Parameters
    ----------
    sample : str
        Path handed to ``mzxml.read``.

    Returns
    -------
    tuple
        ``(peak_min, peak_max, base_peaks, msI_list)`` where

        * ``peak_min`` / ``peak_max`` are the floor / ceiling of the smallest
          and largest m/z value found in any MS2 scan;
        * ``base_peaks`` maps MS2 scan number to a dict with the precursor
          m/z and intensity, the m/z array, the intensities divided by the
          scan's total ion current, and both TIC values;
        * ``msI_list`` maps MS1 ``'num'`` to a dict with its m/z array and
          TIC.

    Raises
    ------
    ValueError
        If the file holds no MS2 peaks (``min`` of an empty list).
    """
    print("Reading " + sample)
    scans = []
    reader = mzxml.read(sample)
    while True:
        try:
            # next() works on both Python 2 and Python 3 iterators.
            scans.append(next(reader))
        except Exception:
            # Stops at the end of the file and, as before, at the first
            # unreadable scan; Ctrl-C is no longer swallowed.
            break

    print(str(len(scans)) + " scans found in " + sample)
    base_peaks = {}
    msI_list = {}
    all_peaks = []
    for scan in scans:
        if scan['msLevel'] == '1':
            num = scan['num']
            msI_list[num] = {
                "num": num,
                "mzs": scan['m/z array'],
                "TIC": scan['totIonCurrent']
            }
        if scan['msLevel'] == '2':
            precursor = scan['precursorMz'][0]
            ms1_scan_num = int(precursor['precursorScanNum'])
            # NOTE(review): this indexes the 0-based scan list with a scan
            # number; confirm it addresses the intended MS1 scan.
            msI_TIC = scans[ms1_scan_num]['totIonCurrent']
            ms2_TIC = scan['totIonCurrent']
            mzs = scan['m/z array']
            num = int(scan['num'])
            base_peaks[num] = {
                "num": num,
                "base_mz": precursor['precursorMz'],
                # Intensities as a fraction of the scan's TIC.
                "intensities": [
                    value / float(ms2_TIC)
                    for value in scan['intensity array'].tolist()
                ],
                "mzs": mzs,
                "precursor_intensity": precursor['precursorIntensity'],
                "MSI TIC": msI_TIC,
                "MS2 TIC": ms2_TIC
            }
            # extend() avoids rebuilding the whole list for every scan.
            all_peaks.extend(mzs.tolist())

    peak_min = int(math.floor(min(all_peaks)))
    peak_max = int(math.ceil(max(all_peaks)))

    return peak_min, peak_max, base_peaks, msI_list
示例#10
0
def readMs1(mzXmlPath):
    """Return every spectrum of an mzXML file as a list of peaks.

    Returns
    -------
    dict
        scan number -> list of ``(m/z, intensity)`` pairs.  No MS-level
        filtering is applied.
    """
    peaks_by_scan = {}
    with mzxml.read(mzXmlPath) as reader:
        for scan in reader:
            scan_no = int(scan['num'])
            peaks_by_scan[scan_no] = list(
                zip(scan['m/z array'], scan['intensity array']))
    return peaks_by_scan
示例#11
0
def preprocess_sample(sample, antiBase_dict, adduct_titles, cutoff_percent,
                      scan_level):
    """Read an mzXML file and keep the m/z values relevant for matching.

    Parameters
    ----------
    sample : str
        Path handed to ``mzxml.read``.
    antiBase_dict
        Returned unchanged.
    adduct_titles
        Not used by this function.
    cutoff_percent : float
        Only used when ``scan_level == 1``: a peak is kept when
        ``intensity / totIonCurrent >= cutoff_percent``.
    scan_level : int
        ``1`` filters MS1 peaks by their share of the TIC; ``2`` keeps the
        precursor m/z of every MS2 scan.  Any other value yields an empty
        result.

    Returns
    -------
    tuple
        ``(antiBase_dict, filtered_spectra)``.  ``filtered_spectra`` maps scan
        number to a flat list: ``[mz, RT, mz, RT, ...]`` for level 1 (scans
        without a surviving peak are absent) or ``[precursor_mz, RT]`` for
        level 2.
    """
    print("Reading " + sample)
    scans = []
    reader = mzxml.read(sample)
    while True:
        try:
            scans.append(next(reader))
        except Exception:
            # Stops at the end of the file and, as before, at the first
            # unreadable scan; Ctrl-C is no longer swallowed.
            break

    print(str(len(scans)) + " scans found in " + sample)
    filtered_spectra = {}

    if scan_level == 1:
        # Keep only peaks that make up a significant share of the scan's TIC.
        for scan in scans:
            if scan['msLevel'] != 1:
                continue
            num = int(scan['num'])
            rt = scan['retentionTime']
            total_tic = scan['totIonCurrent']
            kept = []
            for mz, intensity in zip(scan['m/z array'],
                                     scan['intensity array']):
                if intensity / total_tic >= cutoff_percent:
                    kept.extend((float(mz), rt))
            if kept:
                filtered_spectra[num] = kept
    elif scan_level == 2:
        # Keep the precursor of every scan that produced an MS2 spectrum.
        for scan in scans:
            if scan['msLevel'] == 2:
                filtered_spectra[int(scan['num'])] = [
                    scan['precursorMz'][0]['precursorMz'],
                    scan['retentionTime']
                ]

    print("Scans have been scanned")
    return antiBase_dict, filtered_spectra
示例#12
0
def readMzXML(mzml_file, msLevel = 1):
    """Yield the scans of one MS level from an mzXML file.

    Parameters
    ----------
    mzml_file : str
        Path handed to ``mzxml.read``.
    msLevel : int
        MS level to keep (default 1).

    Yields
    ------
    tuple
        ``(time, mzs, ints, lvl, index)`` where ``time`` is
        ``float(retentionTime) * 60``, ``mzs`` / ``ints`` are the m/z and
        intensity arrays, ``lvl`` is the integer MS level and ``index`` is the
        0-based position of the scan among all scans of the file.

    Raises
    ------
    AssertionError
        If the m/z and intensity arrays differ in shape.
    """
    wanted_level = int(msLevel)
    with mzxml.read(mzml_file) as reader:
        for index, scan in enumerate(reader):
            lvl = int(scan['msLevel'])
            # Compare the coerced level: the raw value may be a string, in
            # which case it would never equal an integer msLevel.
            if lvl != wanted_level:
                continue
            rt = float(scan['retentionTime']) * 60
            mzs = scan['m/z array']
            ints = scan['intensity array']
            assert mzs.shape == ints.shape
            yield rt, mzs, ints, lvl, index
示例#13
0
def ReadInputFile(file):
    """Read an mzXML file (expected to hold one experimental spectrum).

    Returns a list of tuples ``[(mass1, prob1), ..., (massN, probN)]`` where
    each probability is the peak intensity divided by the summed intensity of
    its spectrum.  If the file holds several spectra their peaks are
    concatenated in reading order.
    """
    peaks = []
    with mzxml.read(file) as spectra:
        for scan in spectra:
            total_intensity = np.sum(scan['intensity array'])
            for i in range(scan['peaksCount']):
                share = scan['intensity array'][i] / total_intensity
                peaks.append((scan['m/z array'][i], share))
    return peaks
示例#14
0
def load_from_mzxml(filename: str,
                    ms_level: int = 2,
                    metadata_harmonization: bool = True
                    ) -> Generator[Spectrum, None, None]:
    """Load spectrum(s) from mzXML file.

    This function will create ~matchms.Spectrum for every spectrum of desired
    ms_level found in a given MzXML file. Spectra without any peaks are
    skipped. For more extensive parsing options consider using the pyteomics
    package.

    Example:

    .. code-block:: python

        from matchms.importing import load_from_mzxml

        file_mzxml = "testdata.mzxml"
        spectrums = list(load_from_mzxml(file_mzxml))

    Parameters
    ----------
    filename:
        Filename for mzXML file to import.
    ms_level:
        Specify which ms level to import. Default is 2.
    metadata_harmonization : bool, optional
        Set to False if metadata harmonization to default keys is not desired.
        The default is True.
    """
    # The context manager closes the file when the generator is exhausted or
    # closed.
    with mzxml.read(filename, dtype=dict) as reader:
        for pyteomics_spectrum in reader:
            # The level may be stored under either key.
            level_matches = (
                ("ms level" in pyteomics_spectrum
                 and pyteomics_spectrum["ms level"] == ms_level)
                or ("msLevel" in pyteomics_spectrum
                    and pyteomics_spectrum["msLevel"] == ms_level))
            if not level_matches:
                continue

            metadata = parse_mzml_mzxml_metadata(pyteomics_spectrum)
            mz = numpy.asarray(pyteomics_spectrum["m/z array"], dtype="float")
            intensities = numpy.asarray(pyteomics_spectrum["intensity array"],
                                        dtype="float")

            if mz.shape[0] > 0:
                # Sort by mz (if not sorted already)
                if not numpy.all(mz[:-1] <= mz[1:]):
                    idx_sorted = numpy.argsort(mz)
                    mz = mz[idx_sorted]
                    intensities = intensities[idx_sorted]

                yield Spectrum(mz=mz,
                               intensities=intensities,
                               metadata=metadata,
                               metadata_harmonization=metadata_harmonization)
示例#15
0
def load_control_data(args):
    """Collect the first spectrum of every control mzXML file.

    The directory searched is ``args['output']``, or ``args['sample']`` when
    ``args['output']`` is the empty string.  It is walked recursively for
    files whose name starts with ``args['control']`` and ends with
    ``.mzXML``.

    Parameters
    ----------
    args : dict
        Must contain the keys ``'output'``, ``'sample'`` and ``'control'``.

    Returns
    -------
    list
        One entry ``[[first_spectrum, path], all_paths]`` per matching file,
        where ``all_paths`` is the same list of every matching path.

    Raises
    ------
    IndexError
        If a matching file contains no spectra.
    """
    directory = args['sample'] if args['output'] == '' else args['output']
    mzxml_list = [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(directory)
        for name in names
        if name.startswith(args['control']) and name.endswith('.mzXML')
    ]

    def first_spectrum(path):
        # Stop after the first spectrum instead of parsing the whole file,
        # and close the file afterwards.
        with pytmzxml.read(path) as reader:
            for spectrum in reader:
                return spectrum
        raise IndexError('no spectra found in ' + path)

    return [[[first_spectrum(path), path], mzxml_list]
            for path in mzxml_list]
示例#16
0
def preprocess_sample(sample):
    """Read one mzXML file and collect its log-normalised MS2 spectra.

    Parameters
    ----------
    sample : str
        Path handed to ``mzxml.read``.

    Returns
    -------
    tuple
        ``(peak_min, peak_max, base_peaks)`` where ``peak_min`` / ``peak_max``
        are the floor / ceiling of the smallest and largest m/z value found in
        any MS2 scan, and ``base_peaks`` maps MS2 scan number to a dict with
        the precursor m/z and intensity, the m/z array, the intensities as
        ``log(intensity / MS2 TIC)`` and ``"Percent of Sample"``
        (MS2 TIC / MS1 TIC).

    Raises
    ------
    ValueError
        If the file holds no MS2 peaks (``min`` of an empty list), or if
        ``math.log`` rejects ``intensity / MS2 TIC``.
    """
    print("Reading " + sample)
    scans = []
    reader = mzxml.read(sample)
    while True:
        try:
            # next() works on both Python 2 and Python 3 iterators.
            scans.append(next(reader))
        except Exception:
            # Stops at the end of the file and, as before, at the first
            # unreadable scan; Ctrl-C is no longer swallowed.
            break

    print(str(len(scans)) + " scans found in " + sample)
    base_peaks = {}
    all_peaks = []
    for scan in scans:
        if scan['msLevel'] != '2':
            continue
        precursor = scan['precursorMz'][0]
        ms1_scan_num = int(precursor['precursorScanNum'])
        # NOTE(review): this indexes the 0-based scan list with a scan number;
        # confirm it addresses the intended MS1 scan.
        msI_TIC = scans[ms1_scan_num]['totIonCurrent']
        ms2_TIC = scan['totIonCurrent']
        mzs = scan['m/z array']
        num = int(scan['num'])
        base_peaks[num] = {
            "num": num,
            "base_mz": precursor['precursorMz'],
            # Log of each intensity's share of the scan's TIC.
            "intensities": [
                math.log(value / ms2_TIC)
                for value in scan['intensity array'].tolist()
            ],
            "mzs": mzs,
            "precursor_intensity": precursor['precursorIntensity'],
            "Percent of Sample": ms2_TIC / msI_TIC
        }
        # extend() avoids rebuilding the whole list for every scan.
        all_peaks.extend(mzs.tolist())

    peak_min = int(math.floor(min(all_peaks)))
    peak_max = int(math.ceil(max(all_peaks)))

    return peak_min, peak_max, base_peaks
def heavy_light_quantification(fragDict, libDict, mzxmlFiles, outDir, massTol,
                               minMatch, ratioType, correction, hist):
    """Add one ratio column per mzXML file to the quantification table.

    The table comes from ``initialize_quantification_output(fragDict,
    libDict)``.  For every file, each scan id in ``libDict`` (sorted) is
    fetched by id, matched against its library spectra, and the matches are
    pooled.  When ``correction != -1`` the pooled matches are filtered with
    ``filter_by_corrected_ppm_window``.  Ratios are then determined (if any
    library intensities were matched) and stored in a new column named after
    the file, looked up by ``(int(row['scan']), row['peptide'])``; missing
    entries are NaN.

    A per-scan noise cutoff, half the mean of the ten smallest intensities of
    the scan, is passed to ``determine_ratios``.

    ``outDir`` is not used by this function.

    Returns
    -------
    The table with the added columns.
    """
    finalDf = initialize_quantification_output(fragDict, libDict)

    for mzxml_path in mzxmlFiles:
        pooled_matches = (
            QuantificationSpectraMatcher.QuantificationSpectraMatcher())
        noise_cutoffs = dict()
        with mzxml.read(mzxml_path, use_index=True) as reader:
            for scan in sorted(libDict.keys()):
                spec = reader.get_by_id(scan)

                lowest_ten = sorted(spec['intensity array'])[:10]
                noise_cutoffs[int(scan)] = np.mean(lowest_ten) / 2

                query_peaks = smf.format_spectra_for_pooling(spec,
                                                             scan,
                                                             sqrt=False)
                query_peaks.sort()
                library_peaks = sorted(libDict[scan])

                scan_matches = (
                    QuantificationSpectraMatcher.QuantificationSpectraMatcher())
                scan_matches.compare_spectra(library_peaks, query_peaks,
                                             massTol, minMatch)
                pooled_matches.extend_all_spectra(scan_matches)

        if correction != -1:
            pooled_matches.filter_by_corrected_ppm_window(
                correction, hist, minMatch)

        # Unknown (scan, peptide) pairs default to NaN.
        ratios = defaultdict(lambda: np.nan)
        if len(pooled_matches.libraryIntensities) != 0:
            ratios = pooled_matches.determine_ratios(ratios, noise_cutoffs,
                                                     ratioType, minMatch)

        column = []
        for _, row in finalDf.iterrows():
            column.append(ratios[(int(row['scan']), row['peptide'])])
        finalDf[mzxml_path] = column

    smf.print_milestone('Finish SILAC Quantification')
    return finalDf
示例#18
0
def readPeaksFromXML(mzXmlPath):
    """Read every spectrum of an mzXML file together with its precursor data.

    Returns
    -------
    dict
        scan number -> ``[neutral precursor mass, peaks, precursor scan
        number, precursor charge, precursor m/z]`` where ``peaks`` is a list
        of ``(m/z, intensity)`` pairs and the neutral mass is
        ``precursorMz * charge - ATOM_MASS['Z'] * charge``.
    """
    spectra_by_scan = {}
    with mzxml.read(mzXmlPath) as reader:
        for scan in reader:
            precursor = scan['precursorMz'][0]
            charge = precursor['precursorCharge']
            neutral_mass = (precursor['precursorMz'] * charge -
                            ATOM_MASS['Z'] * charge)
            parent_scan = int(precursor['precursorScanNum'])
            precursor_mz = precursor['precursorMz']
            peaks = list(zip(scan['m/z array'], scan['intensity array']))
            spectra_by_scan[int(scan['num'])] = [
                neutral_mass, peaks, parent_scan, charge, precursor_mz
            ]

    return spectra_by_scan
示例#19
0
def getMS2(outDict):
    """Extract the requested MS2 scans from each mzXML file.

    Parameters
    ----------
    outDict : dict
        Maps an mzXML file name to a container of wanted scan ids.  The ids
        are compared against ``spec['id']`` and must be convertible to
        ``int``.

    Returns
    -------
    dict
        Maps each file name to ``massCorrection(scans)``, where ``scans`` maps
        scan id to a dict with the keys ``'mz'``, ``'intensity'`` and
        ``'rt'``.

    Notes
    -----
    Files are opened as ``"C:/" + name``.  Reading stops at the first
    unwanted scan whose number exceeds the largest wanted scan number.
    """
    ms2Dict = dict()
    for mzXML in outDict.keys():
        wanted = outDict[mzXML]
        # Compare scan ids numerically: a plain max() over string ids is
        # lexicographic ("9" > "100") and could end the loop too early.
        last_wanted = max(int(scan) for scan in wanted) if wanted else None
        scans = dict()
        with mzxml.read("C:/" + mzXML) as reader:
            for spec in reader:
                scanNum = spec['id']
                if scanNum in wanted:
                    scans[scanNum] = {
                        'mz': spec['m/z array'],
                        'intensity': spec['intensity array'],
                        'rt': spec['retentionTime'],
                    }
                elif last_wanted is None or int(scanNum) > last_wanted:
                    # Past the last wanted scan: nothing left to collect.
                    break
        ms2Dict[mzXML] = massCorrection(scans)
    return ms2Dict
示例#20
0
def get_mzxml(path, prec_digits=2):
    """Generate rounded spectra from an mzXML file, one per scan.

    Parameters
    ----------
    path : str
        Path to the mzXML file containing the mass spectrum.
    prec_digits : int
        Forwarded to ``round_spectrum`` as its third argument.

    Yields
    ------
    tuple
        The ``(mz, intensity)`` pair returned by ``round_spectrum`` for each
        spectrum of the file.
    """
    with mzxml.read(path) as spectra:
        for scan in spectra:
            rounded_mz, rounded_intensity = round_spectrum(
                scan['m/z array'], scan['intensity array'], prec_digits)
            yield rounded_mz, rounded_intensity
def pool_scans_by_mz_windows(querySpectraFile):
    """Group the scans of an mzXML file by precursor m/z window.

    Scans without a ``'precursorMz'`` entry are skipped.

    Returns
    -------
    tuple of (defaultdict, defaultdict)
        * ``(precursor m/z, window width)`` -> list of scan numbers;
        * scan number -> dict with the keys ``'precursorMz'``,
          ``'windowWideness'``, ``'peaksCount'`` and ``'CV'`` (the
          compensation voltage, or ``''`` when the scan has none).
    """
    scans_by_window = defaultdict(list)
    values_by_scan = defaultdict(dict)

    with mzxml.read(querySpectraFile, use_index=True) as reader:
        for spec in reader:
            if 'precursorMz' not in spec:
                continue

            scan = spec['num']
            precursor = spec['precursorMz'][0]
            prec_mz = precursor['precursorMz']
            width = precursor['windowWideness']
            scans_by_window[prec_mz, width].append(scan)

            entry = values_by_scan[scan]
            entry['precursorMz'] = prec_mz
            entry['windowWideness'] = width
            entry['peaksCount'] = spec['peaksCount']
            entry['CV'] = (spec['compensationVoltage']
                           if 'compensationVoltage' in spec else '')

    return scans_by_window, values_by_scan
示例#22
0
def test_cwt_peak_picking():
    """Old CWT-based peak picking with peak correction and plotting.

    Reads a fixed mzXML file, takes spectrum 43, restricts it to the m/z
    region [200, 250], runs ``signal.find_peaks_cwt`` and
    ``get_corrected_peak_indices`` on it, prints the elapsed time and peak
    counts, and plots the raw signal with both sets of peaks.

    Per the original note: super long (~306 s per scan). Thrashed.
    """
    all_spectra = list(mzxml.read(
        '/Users/andreidm/ETH/projects/ms_feature_extractor/data/CsI_NaI_best_conc_mzXML/CsI_NaI_neg_08.mzXML'))

    # Spectrum 43 is a nice point on the chromatogram.
    chosen_spectrum = all_spectra[43]
    mz_region, intensities = extract_mz_region(chosen_spectrum, [200, 250])

    plt.plot(mz_region, intensities, lw=1)

    started = time.time()

    # This pair of widths and noise percent allows identification of
    # everything beyond 100 intensity value (visually):
    #   the larger the widths, the fewer relevant peaks are identified;
    #   the larger the noise percent, the more redundant peaks are identified.
    cwt_peaks = signal.find_peaks_cwt(intensities, [0.5], min_snr=1, noise_perc=5)
    corrected_peaks = get_corrected_peak_indices(cwt_peaks, intensities, step=3, min_intensity=100)

    print('\n', time.time() - started, "seconds elapsed\n")
    print("\nTotal number of CWT peaks = ", len(cwt_peaks))
    print("\nTotal number of corrected peaks = ", len(corrected_peaks))

    # Green crosses: raw CWT peaks; red dots: corrected peaks.
    plt.plot(mz_region[cwt_peaks], intensities[cwt_peaks], 'gx', lw=1)
    plt.plot(mz_region[corrected_peaks], intensities[corrected_peaks], 'r.', lw=1)

    plt.show()
示例#23
0
def readXML(bsaMzXmlPath):
    """Return the m/z array of every spectrum in an mzXML file.

    The earlier version also built a structured ``(scanNo, pcCharge, pcMass)``
    record per spectrum that was never used, and that stored the precursor
    charge in the ``pcMass`` field.  That dead code is removed, so precursor
    information is no longer read and scans without it no longer raise.

    Parameters
    ----------
    bsaMzXmlPath : str
        Path handed to ``mzxml.read``.

    Returns
    -------
    dict
        scan number (int) -> the spectrum's ``'m/z array'``.
    """
    with mzxml.read(bsaMzXmlPath) as spectra:
        return {
            int(spectrum['num']): spectrum['m/z array']
            for spectrum in spectra
        }
示例#24
0
def average_ms1(input_filename,
                output_filename=None,
                bin_width=1.0,
                format="csv"):
    """Pool the peaks of all non-MS2 spectra of a file into one binned vector.

    Parameters
    ----------
    input_filename : str
        ``.mzXML`` or ``.mzML`` file; the extension is matched
        case-insensitively.
    output_filename : str, optional
        When given, the vector is appended to this file as CSV.
    bin_width : float
        Forwarded to ``vectorize_peaks``.
    format : str
        Accepted for compatibility; not used by this function.

    Returns
    -------
    The result of ``vectorize_peaks(peaks, 2000, bin_width)``.

    Raises
    ------
    ValueError
        If the file extension is neither ``.mzXML`` nor ``.mzML``.
    """
    extension = os.path.splitext(input_filename)[1].lower()
    if extension == ".mzxml":
        read = mzxml.read
    elif extension == ".mzml":
        read = mzml.read
    else:
        # Previously this fell through to an unbound local variable.
        raise ValueError("Unsupported file extension %r; expected .mzXML or "
                         ".mzML" % extension)

    peaks_list = []
    with read(input_filename, read_schema=True) as spectra:
        for element in spectra:
            # "ms level" takes precedence over "msLevel".  A spectrum with
            # neither key is treated as not MS2, instead of inheriting the
            # level of the previous spectrum.
            mslevel = element.get("ms level", element.get("msLevel"))
            if mslevel != 2:
                peaks_list.extend(
                    zip(element['m/z array'], element['intensity array']))

    numpy_vector = vectorize_peaks(peaks_list, 2000, bin_width)

    if output_filename is not None:
        dt = pd.DataFrame(data=numpy_vector)
        dt.to_csv(output_filename, mode='a', index=True)

    return numpy_vector
示例#25
0
def mzxml_import_untargeted(mdf, mzxml_dir, compound_col='compound'):
    """Collect spectra whose precursor matches a compound listed in ``mdf``.

    Sample files are derived from the columns of ``mdf`` whose name contains
    4-8 digits followed by ``_``; ``'.mzxml'`` is appended to each column
    name and the file is looked up in ``mzxml_dir``.  In every file, each
    spectrum that has a ``'precursorMz'`` entry is compared with every row of
    ``mdf``; it matches when the relative m/z difference is below the
    module-level ``spectra_threshold`` and the retention-time difference is
    below the module-level ``rt_thresh``.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Needs the columns ``compound_col``, ``'medMz'`` and ``'medRt'``.
    mzxml_dir : str
        Directory containing the mzXML files.
    compound_col : str
        Name of the compound column (default ``'compound'``).

    Returns
    -------
    pandas.DataFrame
        ``mdf`` (compound, medMz, medRt) joined per compound with the matching
        spectra of all files, plus a ``'sample'`` column; rows whose ``medRt``
        differs from the measured retention time by 1 or more are removed.
        An empty frame is returned when no file produced a match.

    Notes
    -----
    Any error while processing a file is reported as ``'<file> not found'``
    and the file is skipped.
    """
    # Sample columns are recognised by a 4-8 digit run (a date) followed by
    # "_".  A raw string keeps "\d" from being an invalid string escape.
    file_list = [
        s + '.mzxml' for s in mdf.filter(regex=r'\d{4,8}[_]').columns
    ]
    print(file_list)

    dataframe_list = []
    # Returned as-is when no file yields a match; previously the final
    # ``return`` failed on an unbound local in that case.
    group_df = pd.DataFrame()

    # Keep only the columns needed for matching: compound name, m/z, RT.
    mdf = pd.concat([mdf[compound_col], mdf['medMz'], mdf['medRt']], axis=1)

    for file_to_open in file_list:
        data_path_1 = os.path.join(mzxml_dir, file_to_open)

        results = []
        try:
            ms_file = mzxml.read(data_path_1,
                                 read_schema=True,
                                 iterative=False,
                                 use_index=False,
                                 dtype=None)

            print(file_to_open + ' : Processing...')

            for spectra in ms_file:
                # Only spectra carrying a 'precursorMz' entry are considered.
                if 'precursorMz' not in spectra.keys():
                    continue
                mz_measured = spectra['precursorMz'][0]['precursorMz']

                for k, mz_target in enumerate(mdf[mdf.columns[1]]):
                    matches = (
                        abs((mz_measured - mz_target) / mz_target) <
                        spectra_threshold) and (
                            abs(mdf.iloc[k, 2] - spectra['retentionTime']) <
                            rt_thresh)
                    if not matches:
                        continue

                    # mzarray pairs m/z with raw intensity; mzarray_norm
                    # pairs it with intensity relative to the largest peak,
                    # sorted by descending relative intensity.
                    q = spectra['intensity array']
                    q_norm = q / np.amax(q, axis=0)
                    r = spectra['m/z array']
                    mzarray = np.array(list(zip(r, q)))
                    mzarray_norm = np.array(list(zip(r, q_norm)))
                    mzarray_norm = mzarray_norm[(
                        -mzarray_norm[:, 1]).argsort()]

                    results.append(pd.Series({
                        'precursorMz_m': mz_measured,
                        'retentionTime_m': spectra['retentionTime'],
                        'mzarray': mzarray,
                        'mzarray_norm': mzarray_norm,
                        # Honour compound_col instead of a fixed column name.
                        'compound': mdf.iloc[k][compound_col]
                    }))

            results = pd.DataFrame(results)
            if len(results) == 0:
                continue

            # Index both tables by compound and join them.
            results_c = results.set_index('compound')
            mdf_c = mdf.copy().set_index(compound_col)
            data = mdf_c.join(results_c)
            data["sample"] = file_to_open

            dataframe_list.append(data)
            group_df = pd.concat(dataframe_list)
            # When the same compound name is listed with different RTs, the
            # join pairs some spectra with the wrong RT; drop those rows.
            group_df['diff'] = group_df['medRt'] - group_df['retentionTime_m']
            group_df = group_df.loc[~(abs(group_df['diff']) >= 1)]
            group_df = group_df.drop(['diff'], axis=1)

        except Exception:
            # Still best-effort per file, but Ctrl-C and SystemExit are no
            # longer swallowed.
            print(file_to_open + ' not found')
            continue

    return group_df
示例#26
0
 def test_read_dtype(self):
     """Arrays read with an explicit ``dtype`` mapping carry those dtypes."""
     expected = {'m/z array': np.float32, 'intensity array': np.int32}
     with read(self.path, dtype=expected) as reader:
         for spectrum in reader:
             for array_name, array_dtype in expected.items():
                 self.assertEqual(spectrum[array_name].dtype, array_dtype)
def perform_spectra_pooling_and_analysis(querySpectraFile, outFile, lib,
                                         tolerance, maxQuerySpectraToPool,
                                         corrected, histFile):
    """Pool query scans per m/z window, match them to the library, write output.

    The work happens in this order:

    1. ``pool_scans_by_mz_windows`` groups the scans of ``querySpectraFile``.
    2. For every window that contains library keys, the library spectra are
       pooled once and the query scans are pooled in batches; each batch is
       compared against the pooled library spectra and the matches are
       accumulated.
    3. A score cutoff is computed with ``find_score_fdr_cutoff``. When
       ``corrected`` is not ``-1`` the matches are filtered by a corrected
       ppm window and the cutoff is computed again.
    4. The accumulated matches are written with ``write_output``.

    Args:
        querySpectraFile: Path handed to ``pool_scans_by_mz_windows``,
            ``mzxml.read`` and ``write_output``.
        outFile: Output target forwarded to ``write_output``.
        lib: Library object forwarded to ``gather_library_metadata``,
            ``pool_lib_spectra`` and ``write_output``.
        tolerance: Forwarded unchanged to ``compare_spectra``.
        maxQuerySpectraToPool: Batch interval. A batch is compared whenever
            the scan index inside a window is a non-zero multiple of this
            value, and always at the last scan of the window.
        corrected: ``-1`` skips the correction step; any other value is
            forwarded to ``filter_by_corrected_ppm_window``.
        histFile: Forwarded to ``filter_by_corrected_ppm_window``.

    Returns:
        None. Results are written by ``write_output``.
    """
    smf.print_milestone('Begin Grouping Scans by m/z Windows:')
    queWindowDict, queScanValuesDict = pool_scans_by_mz_windows(
        querySpectraFile)

    print('Number of Unpooled MS/MS Query Spectra: ' +
          str(len(queScanValuesDict)))
    windowCount = len(queWindowDict)
    print('Number of Pooled MS/MS Query Spectra/Mz Windows: ' +
          str(windowCount),
          flush=True)

    # Progress is reported at an interval scaled to the number of windows:
    # every window for up to 100 windows, every 10th for up to 1000, and so
    # on (one more factor of ten per order of magnitude).
    magnitude = 100
    while magnitude < windowCount:
        magnitude *= 10
    printFriendlyCounter = magnitude / 100

    allLibKeys, libIdToKeyDict, libIdToDecoyDict = gather_library_metadata(lib)
    allSpectraMatches = (
        IdentificationSpectraMatcher.IdentificationSpectraMatcher())
    numWindowsAnalyzed = 0

    lastCheckpoint = timer()
    smf.print_milestone('Begin Pooled Spectra Analysis:')
    with mzxml.read(querySpectraFile, use_index=True) as spectra:

        for precMz_win, scans in queWindowDict.items():
            # presumably precMz_win is (window centre, window width) — the
            # bounds are element 0 plus/minus half of element 1.
            halfWidth = precMz_win[1] / 2
            top_mz = precMz_win[0] + halfWidth
            bottom_mz = precMz_win[0] - halfWidth
            libKeys = identify_lib_spectra_in_window(top_mz, bottom_mz,
                                                     allLibKeys)
            # Windows without library spectra are skipped entirely and do
            # not count towards the progress counter.
            if len(libKeys) == 0:
                continue
            pooledLibSpectra = pool_lib_spectra(lib, libKeys)
            pooledQueSpectra = []

            lastIndex = len(scans) - 1
            for i, scanNumber in enumerate(scans):
                queSpectrum = spectra.get_by_id(scanNumber)
                pooledQueSpectra.extend(
                    smf.format_spectra_for_pooling(queSpectrum, scanNumber))

                batchBoundary = (i % maxQuerySpectraToPool == 0 and i != 0)
                if not (batchBoundary or i == lastIndex):
                    continue

                # Compare the current batch, merge its matches into the
                # overall result, then start a fresh batch.
                pooledQueSpectra.sort()
                windowSpectraMatches = (
                    IdentificationSpectraMatcher.IdentificationSpectraMatcher(
                    ))
                windowSpectraMatches.compare_spectra(pooledLibSpectra,
                                                     pooledQueSpectra,
                                                     tolerance,
                                                     libIdToDecoyDict)
                allSpectraMatches.extend_all_spectra(windowSpectraMatches)
                pooledQueSpectra.clear()

            numWindowsAnalyzed += 1
            if numWindowsAnalyzed % printFriendlyCounter != 0:
                continue

            now = timer()
            print('\nNumber of Pooled Experimental Spectra Analyzed: ' +
                  str(numWindowsAnalyzed))
            print('Number of Spectra in Current Pooled Spectra: ' +
                  str(len(scans)))
            print('Time Since Last Checkpoint: ' +
                  str(round(now - lastCheckpoint, 2)) + ' Seconds',
                  flush=True)
            lastCheckpoint = now

    smf.print_milestone('Begin FDR Analysis:')
    maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    correctionRequested = corrected != -1
    if correctionRequested:
        smf.print_milestone('Begin Correction Process:')
        allSpectraMatches.filter_by_corrected_ppm_window(
            corrected, maccCutoff, histFile)

        # The cutoff is recomputed on the filtered matches.
        smf.print_milestone('Begin Corrected FDR Analysis:')
        maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    smf.print_milestone('\nBegin Writing to File: ')
    allSpectraMatches.write_output(outFile, querySpectraFile, maccCutoff,
                                   queScanValuesDict, libIdToKeyDict, lib)
from pyteomics import mzxml
from src.msfe import ms_operator

# m/z regions ([lower, upper]) of annotated peaks; not consulted by the
# lookup loop below.
annotated_peaks = [
    [204.92, 204.94],
    [205.15, 205.17],
    [205.92, 205.94],
    [207.055, 207.075],
    [207.092, 207.110],
    [208.085, 208.10],
    [208.135, 208.150],
    [214.99, 215.015],
    [216.93, 216.95],
    [239.055, 239.075],
]

# m/z regions ([lower, upper]) that are actually searched below.
expected_peaks = [
    [126.90, 126.92],
    [139.01, 139.03],
    [276.79, 276.81],
    [271.84, 271.85],
    [1047.9, 1047.92],
    [1048.9, 1048.92],
]

# NOTE: absolute, machine-specific path; the whole file is read into memory.
spectra = list(
    mzxml.read(
        '/Users/andreidm/ETH/projects/ms_feature_extractor/data/CsI_NaI_best_conc_mzXML/CsI_NaI_neg_08.mzXML'
    ))
# Scan 43 was picked by hand as a good point on the chromatogram.
mid_spectrum = spectra[43]

# One located peak per entry of expected_peaks, in the same order.
accurate_peak_locations = []

for mz_region in expected_peaks:
    print(mz_region, "is being processed...")
    located = ms_operator.locate_annotated_peak(mz_region, mid_spectrum)
    accurate_peak_locations.append(located)

print("Done!")
print()
print(accurate_peak_locations)
示例#29
0
from pyteomics import mzxml
import sys
from itertools import groupby
import collections
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns

# Script: histogram of MS2 precursor m/z values found in one mzXML file.
# Usage: <script> <path-to-mzXML>
scans = []
sample = sys.argv[1]

# Read every scan up front. The context manager closes the file, and real
# parse errors now surface instead of being hidden by a bare ``except``.
with mzxml.read(sample) as r:
    scans = list(r)

freq_list = []
print("making frequency list")
for scan in scans:
    # The parser may deliver msLevel as an int or as a string; comparing the
    # text form accepts both.
    if str(scan['msLevel']) == '2':
        # Precursor m/z scaled to an integer (4 decimal places) so equal
        # values can be counted exactly.
        base_mz = int(scan['precursorMz'][0]['precursorMz'] * 10000)
        freq_list.append(base_mz)
freq_list.sort()

# Occurrences per scaled m/z value, counted in a single pass.
d = collections.Counter(freq_list)
# (value, count) pairs ordered by ascending count.
od = sorted(d.items(), key=lambda x: x[1])
print(freq_list)

#sns.distplot(d.keys())
# At least one bin is requested so an input without MS2 scans does not fail.
plt.hist([item / float(10000) for item in freq_list],
         bins=max(len(freq_list), 1))
 def __init__(self, mz_file):
     """Store the path ``mz_file`` and open it with ``mzxml.read``."""
     # Keep the path that was passed in next to the reader object.
     self._file_path = mz_file
     # The reader is kept open on the instance; use_index=True is forwarded
     # to mzxml.read (presumably to allow lookup of scans by id — confirm
     # against the pyteomics documentation).
     self.data = mzxml.read(mz_file, use_index=True)