Пример #1
0
def adjust_prec_mz(mgf_file, error, out_path):
    """
    Rewrite an MGF file with corrected precursor m/z values.

    Every precursor m/z is divided by ``1 - error / 10.0 ** 6`` and the
    spectra are written to ``recal_<name of mgf_file>`` inside ``out_path``.

    Parameters:
    -----------------------------------------
    mgf_file: str,
            path to the MGF file to read
    error: number,
            mass error used for the correction (presumably ppm, given the
            1e6 divisor -- TODO confirm)
    out_path: str,
            output directory, created when it does not exist yet

    Raises: Exception if the output file is already present in ``out_path``
    """
    target = os.path.join(out_path, 'recal_' + os.path.basename(mgf_file))

    if os.path.exists(out_path):
        # never overwrite an earlier recalibration result
        if os.path.isfile(target):
            raise Exception('File %s already exists!' % target)
    else:
        os.makedirs(out_path)

    spectra = ProteoFileReader.read_mgf(mgf_file)
    for spec in spectra:
        spec.pepmz = spec.getPrecursorMZ() / (1 - error / 10.0 ** 6)

    ProteoFileReader.write_mgf(spectra, target)
Пример #2
0
def generate_cihcd_spectra(mzml_file):
    """
    Collect every MS3 scan of an mzML file as an MS2_spectrum object.

    The precursor m/z of each resulting spectrum is taken from the first
    ``<m/z>@`` entry of the scan's filter string; precursor intensity and
    charge are set to the placeholder value -1.

    Parameters:
    -----------------------------------------
    mzml_file: str,
            path to mzML file

    Return: list(MS2_spectrum)

    Raises: ValueError if a filter string does not start with FT/IT or
            contains no ``<m/z>@`` entry
    """
    mzml_reader = mzml.read(mzml_file)
    cihcd_spectra = []

    for spectrum in mzml_reader:
        if spectrum['ms level'] != 3:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        # The filter string must start with the detector type and carry at
        # least one "<m/z>@" entry. ValueError replaces the Python-2-only
        # StandardError (a NameError on Python 3); on Python 2 ValueError is
        # a StandardError subclass, so existing handlers keep working.
        precursor_mz_groups = re.findall(r"([0-9.]+)@", filter_str)
        if re.search(r"^(FT|IT)", filter_str) is None or not precursor_mz_groups:
            raise ValueError("filter string parse error: %s" % filter_str)

        ms2_id = spectrum['precursorList']['precursor'][0]['spectrumRef']

        title = (os.path.split(mzml_file)[1].split('.mzML')[0] + " " +
                 spectrum['id'] + " ms2_scanId=" + ms2_id)
        # presumably converts minutes to seconds -- TODO confirm the unit
        rt = scan['scan start time'] * 60

        # take ms2 precursor as precursor
        # NOTE(review): this is the matched text (a str), not a float
        pre_mz = precursor_mz_groups[0]
        pre_int = -1
        pre_z = -1
        # materialise the pairs: on Python 3 zip() is a one-shot iterator
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        cihcd_spectra.append(
            ProteoFileReader.MS2_spectrum(title, rt, pre_mz, pre_int, pre_z,
                                          peaks))

    return cihcd_spectra
Пример #3
0
    def deisotope_spectra(self,
                          infile,
                          in_type="MGF",
                          n_jobs=-1,
                          return_type="spectrum",
                          show_progress=False):
        """
        Deisotope the spectra of an MGF or mzML file in parallel.

        Parameters:
        -----------------------------------------
        infile: str,
                path to the input file
        in_type: str,
                input format, "MGF" or "mzML" (matched case-insensitively)
        n_jobs: int,
                passed on to Parallel
        return_type: str,
                passed on to parallel_helper; with "df" the per-spectrum
                results are concatenated with pd.concat
        show_progress: bool,
                passed on to parallel_helper

        Return: concatenated results if return_type is "df", otherwise the
                list of per-spectrum results

        Prints a message and exits with status 1 if in_type is not supported.
        """
        # normalise once so "mgf" and "MGF" are treated alike, as mzML already is
        file_type = in_type.lower()

        if file_type == "mgf":
            spectra = PFR.MGF_Reader()
            spectra.load(infile)

        elif file_type == "mzml":
            mzml_file = oms.MzMLFile()
            exp = oms.MSExperiment()
            mzml_file.load(infile, exp)

            # get the MS2 spectra
            spectra = []
            for spectrum in exp:
                if spectrum.getMSLevel() != 2:
                    continue
                precursor = spectrum.getPrecursors()[0]
                spectra.append(
                    PFR.MS2_spectrum(
                        spectrum.getNativeID(), spectrum.getRT(),
                        precursor.getMZ(), precursor.getIntensity(),
                        precursor.getCharge(),
                        np.matrix(spectrum.get_peaks()).transpose()))
        else:
            print("In type is not supported.")
            # non-zero status: a bare sys.exit() would report success
            sys.exit(1)

        results_store = Parallel(n_jobs=n_jobs)(
            delayed(self.parallel_helper)(spectrum, return_type,
                                          show_progress, ii)
            for ii, spectrum in enumerate(spectra))

        if return_type == "df":
            return pd.concat(results_store)

        return results_store
def adjust_prec_mz(mgf_file, ms1_error, ms2_error, outpath):
    """
    Rewrite an MGF file with corrected precursor and fragment m/z values.

    The precursor m/z is divided by ``1 + ms1_error / 10.0**6`` and the first
    entry of every peak by ``1 + ms2_error / 10.**6``. The result is written
    to ``recal_<name of mgf_file>`` inside ``outpath``.

    Parameters:
    -----------------------------------------
    mgf_file: str,
            path to the MGF file to read
    ms1_error: number,
            error applied to the precursor m/z (presumably ppm -- TODO confirm)
    ms2_error: number,
            error applied to the peak m/z values (presumably ppm -- TODO confirm)
    outpath: str,
            output directory, created when it does not exist yet

    Raises: Exception if the output file is already present in ``outpath``
    """
    target = os.path.join(outpath, 'recal_' + os.path.basename(mgf_file))

    if os.path.exists(outpath):
        # never overwrite an earlier recalibration result
        if os.path.isfile(target):
            raise Exception('File %s already exists!' % target)
    else:
        os.makedirs(outpath)

    spectra = ProteoFileReader.read_mgf(mgf_file)

    for spec in spectra:
        # ms1/precursor correction
        # TODO wrong sign if newer version
        spec.pepmz = spec.getPrecursorMZ() / (1 + ms1_error / 10.0**6)

        # ms2 peak correction, applied in place to the first entry of each peak
        corrected = spec.getPeaks()
        for peak in corrected:
            peak[0] = peak[0] / (1 + ms2_error / 10.**6)
        spec.peaks = corrected

    ProteoFileReader.write_mgf(spectra, target)
Пример #5
0
def process_file(filepath,
                 outdir,
                 mscon_settings,
                 split_acq,
                 detector_filter,
                 mscon_exe,
                 cihcd_ms3=False):  #TODO implement option further up
    """
    Convert one input file with msconvert and derive MGF files from the
    resulting mzML.

    Parameters:
    -----------------------------------------
    filepath: str,
            path to the file to convert
    outdir: str,
            output directory, created when it does not exist yet
    mscon_settings:
            settings handed to mscon_cmd
    split_acq: bool,
            if true, msconvert is not asked for an MGF (``mgf=not split_acq``)
            and the MGF is instead written from the mzML via
            mzml_to_MS2_spectra
    detector_filter:
            detector filter handed to mzml_to_MS2_spectra
    mscon_exe: str,
            msconvert executable
    cihcd_ms3: bool,
            if true, additionally write the MS3 scans to
            ``CIhcD_ms3_<name>.mgf``
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    conv_cmds = mscon_cmd(filepath=filepath,
                          outdir=outdir,
                          settings=mscon_settings,
                          mgf=not split_acq)

    if conv_cmds:
        msconvert = subprocess.Popen([mscon_exe] + conv_cmds)
        # block until the conversion has finished
        # NOTE(review): the exit status of msconvert is not checked
        msconvert.communicate()

    filename = os.path.split(filepath)[1]
    # Strip the extension. rfind() returns -1 for a name without a dot, which
    # used to chop off the last character of the name instead.
    dot = filename.rfind('.')
    stem = filename if dot == -1 else filename[:dot]
    mzml_file = os.path.join(outdir, stem + '.mzML')

    if cihcd_ms3:
        cihcd_spectra = generate_cihcd_spectra(mzml_file)
        ProteoFileReader.write_mgf(
            spectra=cihcd_spectra,
            outfile=os.path.join(outdir, 'CIhcD_ms3_' + stem + '.mgf'))

    if split_acq:
        split_spectra = mzml_to_MS2_spectra(mzml_file, detector_filter)

        ProteoFileReader.write_mgf(
            spectra=split_spectra,
            outfile=os.path.join(outdir, stem + '.mgf'))
Пример #6
0
def adjust_prec_mz(mgf_file, error, outpath):
    """
    Write a copy of an MGF file with corrected precursor masses.

    Every precursor mass is divided by ``1 - error / 10.**6``. Spectra are
    written one by one to ``recal_<name of mgf_file>`` inside ``outpath``;
    peaks whose intensity is not positive are dropped.

    Parameters:
    -----------------------------------------
    mgf_file: str,
            path to the MGF file to read
    error: number,
            mass error used for the correction (presumably ppm -- TODO confirm)
    outpath: str,
            output directory, created when it does not exist yet

    Return: None. Returns without doing anything if the output file already
            exists.
    """
    outfile = os.path.join(outpath, 'recal_' + os.path.split(mgf_file)[1])
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    elif os.path.isfile(outfile):
        return
    exp = ProteoFileReader.MGF_Reader()
    exp.load(mgf_file)

    stavrox_template = """
MASS=Monoisotopic
BEGIN IONS
TITLE={}
PEPMASS={} {}
CHARGE={}+
RTINSECONDS={}
{}
END IONS     """

    # The two interpreter versions are handled with different peak layouts:
    # (m/z, intensity) pairs on Python 2, an m/z sequence in peaks[0] plus an
    # intensity sequence in peaks[1] on Python 3.
    pairwise_peaks = sys.version_info.major < 3

    # the context manager closes the output file even if a spectrum fails
    with open(outfile, "w") as out_writer:
        for spectrum in exp:
            prec_mz_new = spectrum.getPrecursorMass() / (1 - error / 10.**6)
            prec_int = spectrum.getPrecursorIntensity()

            if pairwise_peaks:
                peak_lines = [
                    "%s %s" % (peak[0], peak[1]) for peak in spectrum.peaks
                    if peak[1] > 0
                ]
            else:
                peak_lines = [
                    "%s %s" % (mz, intensity)
                    for mz, intensity in zip(spectrum.peaks[0],
                                             spectrum.peaks[1])
                    if intensity > 0
                ]

            out_writer.write(
                stavrox_template.format(
                    spectrum.getTitle(), prec_mz_new,
                    prec_int if prec_int > 0 else 0,  # clamp to 0
                    int(spectrum.charge), spectrum.getRT(),
                    "\n".join(peak_lines)))
Пример #7
0
def test():
    """
    Scratch script exercising the deisotoper on files and synthetic spectra.

    The function reads files below ``data/``, prints and plots intermediate
    results and makes no assertions; it returns nothing. The ``#%%`` markers
    split it into separately runnable cells.
    """
    #%%
    # load an mzML file through the project reader and print the reader object
    infile = "data/mscon_PF_20_100_0_B160803_02.mzML"
    exp = PFR.mzMLReader(infile)
    print(exp)

    # the first assignment is immediately overwritten; only the second MGF is used
    infile = "data/test.mgf"
    infile = "data/mscon_PF_20_100_0_B160803_02.mgf"

    # deisotope the whole MGF file (the result is not inspected further)
    deisotoper = de.Deisotoper()
    deisotoped_spectra = deisotoper.deisotope_spectra(infile,
                                                      show_progress=True,
                                                      n_jobs=-1)

    #%%
    # single spectrum from a tab-separated file with "MZ" and "INT" columns,
    # run through the individual deisotoping steps
    spectrum = pd.read_csv("data/test.dta2d", sep="\t")
    mz = spectrum["MZ"].values
    intensity = spectrum["INT"].values
    deisotoper = de.Deisotoper()
    G = deisotoper.spec2graph(mz, intensity)
    cluster_ar = deisotoper.extract_isotope_cluster(G, verbose=False)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(
        cluster_ar, mz, intensity)
    # NOTE(review): called on de.deisotoper, not on the local deisotoper
    # instance -- confirm this is intended
    cluster_df = de.deisotoper.assemble_spectrum(cluster_dic_resolved, mz,
                                                 intensity, 1337)
    # the first element of each resolved entry is summed as the accepted count
    n_accepted = sum([i[0] for i in cluster_dic_resolved.values()])
    n_rejected = len(cluster_dic_resolved) - n_accepted

    #%%
    #plt.bar(mz, intensity)
    verbose = False
    deisotoper = de.Deisotoper()

    # =========================================================================
    #
    # =========================================================================
    #example 1 - pseudo overlapping - charge 2 but 1 also possible
    mz = np.array([0, 100, 250, 700, 700.5, 701, 701.5, 800])
    z_test = 2
    # four averagine-model intensities, scaled by 100, placed on the peaks at
    # 700, 700.5, 701 and 701.5; all other peaks get intensity 10
    exp1 = AM.averagine_model(700 * z_test, n_peaks=4,
                              only_intensity=True) * 100
    intensity = np.array([10, 10, 10, exp1[0], exp1[1], exp1[2], exp1[3], 10])

    # ground-truth description of the case (built but not passed on here)
    GT = {}
    GT["ratio"] = "1:0"
    GT["Case"] = "test 1 - only one true isotope cluster"
    GT["TestID"] = "1"
    G = deisotoper.spec2graph(mz, intensity)
    de.plot_graph(G)
    cluster_dic = deisotoper.extract_isotope_cluster(G, verbose)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(
        cluster_dic, mz, intensity)

    #plt.bar(mz, intensity, width=0.2)
    #plt.xlim(699, 702)
    #%%
    # =========================================================================
    # example 2 - overlapping charge 1 and 2
    # =========================================================================
    verbose = True
    deisotoper = de.Deisotoper()

    # start of the timed section
    a = time.time()
    mz = np.array([0, 100, 250, 300, 500, 501, 501.5, 502, 503])
    # two averagine patterns that share the peaks at 501 and 502
    exp1 = AM.averagine_model(
        500, n_peaks=4,
        only_intensity=True) * 100 * 3  #+ #np.random.normal(0, 10, 4)
    exp2 = AM.averagine_model(
        501.5 * 2, n_peaks=3,
        only_intensity=True) * 100 * 6  #+ np.random.normal(0, 10, 3)
    intensity = np.array([
        10, 10, 10, 10, exp1[0], exp1[1] + exp2[0], exp2[1], exp1[2] + exp2[2],
        exp1[3]
    ])

    # ground-truth description, handed to resolve_ambiguous below
    GT = {}
    GT["ratio"] = "1:2"
    GT["Case"] = "test 2 - Overlapping different charge"
    GT["TestID"] = "2"
    G = deisotoper.spec2graph(mz, intensity)
    de.plot_graph(G)
    cluster_ar = deisotoper.extract_isotope_cluster(G, verbose)
    cluster_dic_resolved = deisotoper.resolve_ambiguous(cluster_ar,
                                                        mz,
                                                        intensity,
                                                        verbose=verbose,
                                                        GT=GT)
    print(cluster_ar)
    b = time.time()
    # elapsed time in minutes
    took = (b - a) / 60.
    # NOTE(review): presumably extrapolates the runtime to 30000 spectra -- confirm
    print(took * 30000)
Пример #8
0
def split_mzml(mzml_file, detector="all"):
    """
    function to split a mzML file into dict of MS2_Spectra objects (can be written to mgf format)
    by fragmentation method

    Parameters:
    -----------------------------------------
    mzml_file: str,
            path to mzML file
    detector: str,
            keep only scans whose filter string starts with this detector
            type ('FT' or 'IT'); 'all' keeps every scan

    Return: dict {fragMethod: list(MS2_spectrum)}, without the methods for
            which no spectrum was found

    Raises: ValueError if a filter string does not start with FT or IT;
            Warning (raised as an exception, so nothing is returned) if the
            fragmentation method of at least one spectrum is unknown
    """

    mzml_reader = mzml.read(mzml_file)
    ordered_ms2_spectra = {
        "CID": [],
        "HCD": [],
        "ETD": [],
        "ETciD": [],
        "EThcD": [],
        "unknown": []
    }

    for spectrum in mzml_reader:
        if spectrum['ms level'] != 2:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        detector_match = re.search(r"^(FT|IT)", filter_str)
        if detector_match is None:
            # ValueError replaces the Python-2-only StandardError (a NameError
            # on Python 3); on Python 2 it is a StandardError subclass, so
            # existing handlers keep working.
            raise ValueError("filter string parse error: %s" % filter_str)
        detector_str = detector_match.group(1)
        # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick
        frag_groups = re.findall(r"@([A-Za-z]+)([0-9.]+)", filter_str)

        if detector != "all" and detector != detector_str:
            continue

        title = os.path.split(mzml_file)[1].split('.mzML')[0] + " " + spectrum['id']
        # presumably converts minutes to seconds -- TODO confirm the unit
        rt = scan['scan start time'] * 60
        precursor = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]
        pre_mz = precursor['selected ion m/z']
        # the precursor intensity is optional, fall back to 0
        pre_int = precursor.get('peak intensity', 0)
        pre_z = precursor['charge state']
        # materialise the pairs: on Python 3 zip() is a one-shot iterator
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        ms2class_spectrum = ProteoFileReader.MS2_spectrum(title, rt, pre_mz, pre_int, pre_z, peaks)

        frag_methods = [f[0] for f in frag_groups]

        # etd combined with a second activation yields the hybrid methods
        if "etd" in frag_methods:
            if "cid" in frag_methods:
                frag_method = 'ETciD'
            elif "hcd" in frag_methods:
                frag_method = 'EThcD'
            else:
                frag_method = 'ETD'
        elif "cid" in frag_methods:
            frag_method = 'CID'
        elif "hcd" in frag_methods:
            frag_method = 'HCD'
        else:
            frag_method = 'unknown'
        ordered_ms2_spectra[frag_method].append(ms2class_spectrum)

    if ordered_ms2_spectra['unknown']:
        # NOTE(review): this raises instead of calling warnings.warn, so all
        # collected spectra are discarded -- kept because callers may rely on it
        raise Warning("The fragmentation method of %i spectra could not be identified" % len(ordered_ms2_spectra['unknown']))

    return {k: v for k, v in ordered_ms2_spectra.items() if v}
Пример #9
0
def mzml_to_MS2_spectra(mzml_file, detector_filter="all"):
    """
    function to split a mzML file into a list of MS2_Spectra objects (can be written to mgf format)
    with fragmentation method and detector type

    Parameters:
    -----------------------------------------
    mzml_file: str,
            path to mzML file
    detector_filter: filter scans by detector type ('all', 'FT', 'IT')

    Return: list(MS2_spectrum)

    Raises: Exception if a filter string does not start with FT or IT;
            Warning (raised as an exception, so nothing is returned) if the
            fragmentation method of at least one spectrum is unknown
    """

    mzml_reader = mzml.read(mzml_file)
    sorted_ms2_spectra = []
    unknown_frag_method_count = 0

    for spectrum in mzml_reader:
        if spectrum['ms level'] != 2:
            continue

        scan = spectrum['scanList']['scan'][0]
        filter_str = scan['filter string']

        detector_match = re.search(r"^(FT|IT)", filter_str)
        if detector_match is None:
            raise Exception("filter string parse error: %s" % filter_str)
        detector_str = detector_match.group(1)
        # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and backtick
        frag_groups = re.findall(r"@([A-Za-z]+)([0-9.]+)", filter_str)

        if detector_filter != "all" and detector_filter != detector_str:
            continue

        title = os.path.split(mzml_file)[1].split(
            '.mzML')[0] + " " + spectrum['id']
        # presumably converts minutes to seconds -- TODO confirm the unit
        rt = scan['scan start time'] * 60
        precursor = spectrum['precursorList']['precursor'][0][
            'selectedIonList']['selectedIon'][0]
        pre_mz = precursor['selected ion m/z']
        # the precursor intensity is optional, fall back to 0
        pre_int = precursor.get('peak intensity', 0)
        pre_z = precursor['charge state']
        # materialise the pairs: on Python 3 zip() is a one-shot iterator
        peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))

        frag_methods = [f[0] for f in frag_groups]

        # etd combined with a second activation yields the hybrid methods
        if "etd" in frag_methods:
            if "cid" in frag_methods:
                frag_method = "ETciD"
            elif "hcd" in frag_methods:
                frag_method = "EThcD"
            else:
                frag_method = "ETD"
        elif "cid" in frag_methods:
            frag_method = "CID"
        elif "hcd" in frag_methods:
            frag_method = "HCD"
        else:
            frag_method = 'unknown'
            unknown_frag_method_count += 1

        sorted_ms2_spectra.append(
            ProteoFileReader.MS2_spectrum(
                title,
                rt,
                pre_mz,
                pre_int,
                pre_z,
                peaks,
                detector=detector_str,
                fragmethod=frag_method))

    if unknown_frag_method_count > 0:
        # NOTE(review): this raises instead of calling warnings.warn, so all
        # collected spectra are discarded -- kept because callers may rely on it
        raise Warning(
            "The fragmentation method of %i spectra could not be identified" %
            unknown_frag_method_count)

    return sorted_ms2_spectra
Пример #10
0
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # TODO change to parallel with manual input of error
        for inputfile in mgf_file_list:
            if 'ms3' in os.path.split(inputfile)[1]:
                continue
            if ms2recal:
                mass_recal_ms2.main(fasta=recal_conf['db'],
                                    xi_cnf=recal_conf['xiconf'],
                                    outpath=outdir,
                                    mgf=inputfile,
                                    threads=str(nthr),
                                    val_input=recal_conf['shift_csv'])
            else:
                mass_recal.main(fasta=recal_conf['db'],
                                xi_cnf=recal_conf['xiconf'],
                                outpath=outdir,
                                mgf=inputfile,
                                threads=str(nthr),
                                val_input=recal_conf['shift_csv'])

        mgf_file_list = [
            os.path.join(os.path.split(x)[0], 'recal_' + os.path.split(x)[1])
            for x in mgf_file_list
        ]

    if split_acq:
        for mgf_file in mgf_file_list:
            ProteoFileReader.split_mgf_methods(mgf_file)