Exemplo n.º 1
0
def cal_nnls(LibIntensity, MS2Intensity, penalty):
    """Fit one library spectrum to an observed spectrum by non-negative least squares.

    The library intensities form a single-column sparse matrix. One extra
    row holding ``penalty`` is appended to that column, and a matching ``0``
    is appended to the observed intensities, before the system is handed to
    ``sparse_nnls.lsqnonneg``.

    Args:
        LibIntensity: sequence of library peak intensities.
        MS2Intensity: sequence of observed intensities, one per library peak.
        penalty: value placed in the extra row of the library column.

    Returns:
        The first element of the solver's ``'x'`` result, i.e. the
        coefficient of the single library column.

    Neither input sequence is modified; the previous implementation appended
    to both in place, which corrupted lists reused across calls.
    """
    # Work on copies so the caller's sequences keep their original length.
    LibraryValues = list(LibIntensity) + [penalty]
    ObservedValues = np.array(list(MS2Intensity) + [0])

    # Single column: row i holds LibraryValues[i], every column index is 0.
    RowIndex = list(range(len(LibraryValues)))
    ColIndex = [0] * len(LibraryValues)

    LibraryVector = sparse.coo_matrix((LibraryValues, (RowIndex, ColIndex)))
    Solution = sparse_nnls.lsqnonneg(LibraryVector, ObservedValues,
                                     {'show_progress': False})

    return Solution['x'][0]
Exemplo n.º 2
0
def RegressSpectraOntoLibrary(DIASpectraIterator, Library, tol,
                              maxWindowOffset):
    """Regress each acquired spectrum onto the library spectra that match it.

    Args:
        DIASpectraIterator: iterable of records indexed as
            ``(peaks, precursor m/z, retention time, index, window width)``.
            ``peaks`` is converted with ``np.array`` and only processed when
            the result is 2-D (column 0 = m/z, column 1 = intensity).
            NOTE(review): the merging step uses ``np.searchsorted`` on the
            m/z column, so it presumably expects m/z in ascending order.
        Library: object whose ``.value`` is a mapping from a key (indexable
            at ``[0]`` and ``[1]``) to a dict with ``'PrecursorMZ'`` and
            ``'Spectrum'`` (2-D array, column 0 = m/z, column 1 = intensity).
        tol: relative m/z tolerance; a peak at ``m`` covers
            ``[m - tol * m, m + tol * m]``.
        maxWindowOffset: used instead of the window width when the record's
            window width is not positive.

    Yields:
        One list per input record. If the solver returned nonzero
        coefficients, the list holds one row
        ``[coefficient, index, key[0], key[1], precMZ, precRT]`` per nonzero
        coefficient; otherwise it is ``[[0, index, 0, 0, 0, 0]]``.

    Compared with the earlier version, this one runs on both Python 2 and 3
    (``items()``, floor division for row indices), always halves the window
    with true division, and scans the library once per spectrum.
    """
    RefSpectraLibrary = Library.value

    for DIASpectrum in DIASpectraIterator:

        precMZ = float(DIASpectrum[1])
        precRT = float(DIASpectrum[2])  # MS2 scan retention time, in minutes
        index = DIASpectrum[3]
        windowWidth = DIASpectrum[4]

        DIASpectrum = np.array(DIASpectrum[0])

        LibraryCoeffs = []
        RefPeptideCandidates = []

        if len(DIASpectrum.shape) == 2:

            # SELECT LIBRARY SPECTRA WHOSE PRECURSOR M/Z FALLS IN THE WINDOW.
            # Keys and spectra are collected in the same pass so the two
            # lists are aligned by construction.
            MassWindowCandidates = []
            CandidateRefSpectraLibrary = []
            for key, spectrum in RefSpectraLibrary.items():
                LibraryMZ = float(spectrum['PrecursorMZ'])
                if windowWidth > 0:
                    # 2.0 keeps the half-width exact for integer widths.
                    InWindow = abs(LibraryMZ - precMZ) < windowWidth / 2.0
                else:
                    InWindow = LibraryMZ > precMZ - maxWindowOffset / 2.0
                if InWindow:
                    MassWindowCandidates.append(key)
                    CandidateRefSpectraLibrary.append(spectrum['Spectrum'])

            # MERGING OF POINTS IN ACQUIRED SPECTRUM WITH NEARBY M/Z COORDINATES
            # Each point is assigned to the first point whose upper tolerance
            # bound reaches it; a group keeps that point's m/z and the mean
            # intensity of its members.
            AcquiredMZ = DIASpectrum[:, 0]
            MergedDIASpecCoordIndices = np.searchsorted(
                AcquiredMZ + tol * AcquiredMZ, AcquiredMZ)
            GroupStarts = np.unique(MergedDIASpecCoordIndices)
            MergedDIASpecCoords = AcquiredMZ[GroupStarts]
            MergedDIASpecIntensities = [
                np.mean(DIASpectrum[MergedDIASpecCoordIndices == g, 1])
                for g in GroupStarts
            ]
            DIASpectrum = np.array(
                (MergedDIASpecCoords, MergedDIASpecIntensities)).transpose()

            # Sorted lower/upper tolerance bounds of every acquired peak. A
            # library m/z whose insertion index is odd is treated as lying
            # inside the tolerance interval of acquired peak (index - 1) // 2.
            CentroidBreaks = np.sort(
                np.concatenate((DIASpectrum[:, 0] - tol * DIASpectrum[:, 0],
                                DIASpectrum[:, 0] + tol * DIASpectrum[:, 0])))

            LocateReferenceCoordsInDIA = [
                np.searchsorted(CentroidBreaks, M[:, 0])
                for M in CandidateRefSpectraLibrary
            ]

            # FILTER LIBRARY SPECTRA: keep those with more than 5 of their
            # (up to) 10 most intense peaks matched in the acquired spectrum.
            ReferencePeaksInDIA = [
                i for i, M in enumerate(CandidateRefSpectraLibrary)
                if np.count_nonzero(
                    LocateReferenceCoordsInDIA[i][np.argsort(-M[:, 1])[:10]]
                    % 2 == 1) > 5
            ]
            RefPeptideCandidates = [
                MassWindowCandidates[i] for i in ReferencePeaksInDIA
            ]

            # Every retained candidate has at least 6 matched peaks, so a
            # non-empty candidate list implies non-empty matrix entries.
            if RefPeptideCandidates:
                RowParts = []
                ColumnParts = []
                EntryParts = []
                UnmatchedTotals = []
                for Column, i in enumerate(ReferencePeaksInDIA):
                    Locations = LocateReferenceCoordsInDIA[i]
                    Intensities = CandidateRefSpectraLibrary[i][:, 1]
                    Normalized = Intensities / sum(Intensities)
                    Matched = Locations % 2 == 1
                    # Odd break index -> 0-based acquired-peak row; floor
                    # division keeps the indices integral.
                    RowParts.append((Locations[Matched] + 1) // 2 - 1)
                    ColumnParts.append(
                        np.repeat(Column, np.count_nonzero(Matched)))
                    EntryParts.append(Normalized[Matched])
                    # Total normalized intensity of peaks absent from the
                    # acquired spectrum; goes into the penalty row.
                    UnmatchedTotals.append(np.sum(Normalized[~Matched]))

                MatchedRows = np.concatenate(RowParts)
                UniqueRowIndices = np.unique(MatchedRows)

                # Project the spectrum to those m/z bins at which at least
                # one column has a nonzero entry, then add a zero for the
                # penalty row.
                DIASpectrumIntensities = np.append(
                    DIASpectrum[UniqueRowIndices, 1], [0])

                # AUGMENT THE LIBRARY MATRIX WITH ONE EXTRA ROW HOLDING, PER
                # COLUMN, THE UNMATCHED INTENSITY TOTAL.
                NumColumns = len(RefPeptideCandidates)
                PenaltyRow = UniqueRowIndices[-1] + 1
                SparseRowIndices = np.append(MatchedRows,
                                             [PenaltyRow] * NumColumns)
                SparseColumnIndices = np.append(np.concatenate(ColumnParts),
                                                np.arange(NumColumns))
                SparseMatrixEntries = np.append(np.concatenate(EntryParts),
                                                UnmatchedTotals)

                # Renumber the row indices according to the projected
                # spectrum, respecting the 0-indexing.
                SparseRowIndices = stats.rankdata(
                    SparseRowIndices, method='dense').astype(int) - 1

                # Duplicate (row, column) entries are summed by the sparse
                # matrix, merging library peaks that hit the same bin.
                LibrarySparseMatrix = sparse.coo_matrix(
                    (SparseMatrixEntries, (SparseRowIndices,
                                           SparseColumnIndices)))
                LibraryCoeffs = sparse_nnls.lsqnonneg(LibrarySparseMatrix,
                                                      DIASpectrumIntensities,
                                                      {'show_progress': False})
                LibraryCoeffs = LibraryCoeffs['x']

        # Pair every nonzero coefficient with the key of its own column.
        NonzeroCoeffs = [(j, c) for j, c in enumerate(LibraryCoeffs)
                         if c != 0]

        Output = [[0, index, 0, 0, 0, 0]]

        if len(NonzeroCoeffs) > 0:
            Output = [[
                c, index, RefPeptideCandidates[j][0],
                RefPeptideCandidates[j][1], precMZ, precRT
            ] for j, c in NonzeroCoeffs]

        yield Output
Exemplo n.º 3
0
def RegressSpectraOntoLibraryWithDecoys(DIASpectraIterator, Library, tol,
                                        maxWindowOffset):
    """Regress each acquired spectrum onto target and decoy library spectra.

    Targets are library spectra whose precursor m/z lies inside the
    isolation window. Decoys are library spectra whose precursor m/z lies
    just outside it; their fragment m/z values are shifted by 20 and their
    keys are prefixed with ``"DECOY_"``.

    Args:
        DIASpectraIterator: iterable of records indexed as
            ``(peaks, precursor m/z, retention time, index, window width)``.
            ``peaks`` is converted with ``np.array`` and only processed when
            the result is 2-D (column 0 = m/z, column 1 = intensity).
            NOTE(review): unlike ``RegressSpectraOntoLibrary`` this function
            does not merge nearby acquired peaks first - confirm that is
            intended.
        Library: object whose ``.value`` is a mapping from a key (``key[0]``
            a string, ``key[1]`` arbitrary) to a dict with ``'PrecursorMZ'``
            and ``'Spectrum'`` (2-D array, column 0 = m/z, column 1 =
            intensity).
        tol: relative m/z tolerance; a peak at ``m`` covers
            ``[m - tol * m, m + tol * m]``.
        maxWindowOffset: used instead of the window width when the record's
            window width is not positive.

    Yields:
        One list per input record. If the solver returned nonzero
        coefficients, the list holds one row
        ``[coefficient, index, key[0], key[1], precMZ, precRT]`` per nonzero
        coefficient (target columns first, then decoy columns); otherwise it
        is ``[[0, index, 0, 0, 0, 0]]``.

    Compared with the earlier version, this one runs on both Python 2 and 3
    (``items()``, floor division for row indices), always halves the window
    with true division, and scans the library once per spectrum.
    """
    RefSpectraLibrary = Library.value

    def _LocatePeaks(Spectra, CentroidBreaks, Shift):
        """Return break indices of every (shifted) peak and the positions of
        spectra with more than 5 of their top-10 peaks matched."""
        Locations = [
            np.searchsorted(CentroidBreaks, M[:, 0] + Shift) for M in Spectra
        ]
        Passing = [
            i for i, M in enumerate(Spectra)
            if np.count_nonzero(
                Locations[i][np.argsort(-M[:, 1])[:10]] % 2 == 1) > 5
        ]
        return Locations, Passing

    def _ColumnTriplets(Spectra, Locations, Passing, FirstColumn):
        """Build sparse rows/columns/entries for matched peaks, plus the
        per-column totals of unmatched normalized intensity."""
        # Seed with empty arrays so concatenation works with no spectra.
        Rows = [np.zeros(0, dtype=int)]
        Columns = [np.zeros(0, dtype=int)]
        Entries = [np.zeros(0)]
        UnmatchedTotals = []
        for Offset, i in enumerate(Passing):
            Intensities = Spectra[i][:, 1]
            Normalized = Intensities / sum(Intensities)
            Matched = Locations[i] % 2 == 1
            # Odd break index -> 0-based acquired-peak row; floor division
            # keeps the indices integral.
            Rows.append((Locations[i][Matched] + 1) // 2 - 1)
            Columns.append(
                np.repeat(FirstColumn + Offset, np.count_nonzero(Matched)))
            Entries.append(Normalized[Matched])
            UnmatchedTotals.append(np.sum(Normalized[~Matched]))
        PenaltyColumns = FirstColumn + np.arange(len(Passing))
        return (np.concatenate(Rows), np.concatenate(Columns),
                np.concatenate(Entries), PenaltyColumns,
                np.array(UnmatchedTotals))

    for DIASpectrum in DIASpectraIterator:

        precMZ = float(DIASpectrum[1])
        precRT = float(DIASpectrum[2])  # MS2 scan retention time, in minutes
        index = DIASpectrum[3]
        windowWidth = DIASpectrum[4]

        DIASpectrum = np.array(DIASpectrum[0])

        LibraryCoeffs = []
        RefPeptideCandidates = []
        DecoyCandidates = []

        if len(DIASpectrum.shape) == 2:

            # SELECT TARGETS (INSIDE THE WINDOW) AND DECOYS (JUST OUTSIDE IT)
            # in a single pass over the library.
            CandidateRefSpectraLibrary = []
            MassWindowCandidates = []
            CandidateDecoyLibrary = []
            MassWindowDecoyCandidates = []
            for key, spectrum in RefSpectraLibrary.items():
                LibraryMZ = float(spectrum['PrecursorMZ'])
                if windowWidth > 0:
                    # 2.0 keeps the half-width exact for integer widths.
                    HalfWidth = windowWidth / 2.0
                    Distance = abs(LibraryMZ - precMZ)
                    IsTarget = Distance < HalfWidth
                    IsDecoy = HalfWidth <= Distance <= windowWidth
                else:
                    HalfOffset = maxWindowOffset / 2.0
                    IsTarget = LibraryMZ > precMZ - HalfOffset
                    IsDecoy = (precMZ - maxWindowOffset <= LibraryMZ <=
                               precMZ - HalfOffset)
                if IsTarget:
                    CandidateRefSpectraLibrary.append(spectrum['Spectrum'])
                    MassWindowCandidates.append(key)
                if IsDecoy:
                    CandidateDecoyLibrary.append(spectrum['Spectrum'])
                    MassWindowDecoyCandidates.append(
                        ("DECOY_" + key[0], key[1]))

            # Sorted lower/upper tolerance bounds of every acquired peak. A
            # library m/z whose insertion index is odd is treated as lying
            # inside the tolerance interval of acquired peak (index - 1) // 2.
            CentroidBreaks = np.sort(
                np.concatenate((DIASpectrum[:, 0] - tol * DIASpectrum[:, 0],
                                DIASpectrum[:, 0] + tol * DIASpectrum[:, 0])))

            # Hard cutoff - more than 5 of the (up to) 10 most intense peaks
            # of a library spectrum must appear in the acquired spectrum.
            LocateReferenceCoordsInDIA, ReferencePeaksInDIA = _LocatePeaks(
                CandidateRefSpectraLibrary, CentroidBreaks, 0)

            # SHIFT ALL FRAGMENT ION PEAKS OF ALL DECOY SPECTRA BY 20 M/Z TO
            # ENSURE DISSIMILARITY FROM REAL SPECTRA
            LocateDecoyCoordsInDIA, DecoyPeaksInDIA = _LocatePeaks(
                CandidateDecoyLibrary, CentroidBreaks, 20)

            RefPeptideCandidates = [
                MassWindowCandidates[i] for i in ReferencePeaksInDIA
            ]
            DecoyCandidates = [
                MassWindowDecoyCandidates[i] for i in DecoyPeaksInDIA
            ]

            # The regression only runs when at least one target survived the
            # filter; surviving targets always contribute matrix entries.
            if RefPeptideCandidates:
                (RefRows, RefColumns, RefEntries, RefPenaltyColumns,
                 RefPenaltyEntries) = _ColumnTriplets(
                     CandidateRefSpectraLibrary, LocateReferenceCoordsInDIA,
                     ReferencePeaksInDIA, 0)
                # Decoy columns follow the target columns.
                (DecoyRows, DecoyColumns, DecoyEntries, DecoyPenaltyColumns,
                 DecoyPenaltyEntries) = _ColumnTriplets(
                     CandidateDecoyLibrary, LocateDecoyCoordsInDIA,
                     DecoyPeaksInDIA, len(RefPeptideCandidates))

                UniqueRowIndices = np.unique(
                    np.concatenate((RefRows, DecoyRows)))

                # Project the spectrum to those m/z bins at which at least
                # one column has a nonzero entry, then add a zero for the
                # penalty row.
                DIASpectrumIntensities = np.append(
                    DIASpectrum[UniqueRowIndices, 1], [0])

                # AUGMENT THE LIBRARY MATRIX WITH ONE EXTRA ROW HOLDING, PER
                # COLUMN, THE UNMATCHED INTENSITY TOTAL.
                PenaltyRow = UniqueRowIndices[-1] + 1

                SparseRowIndices = np.concatenate(
                    (RefRows, np.repeat(PenaltyRow, len(RefPenaltyColumns)),
                     DecoyRows,
                     np.repeat(PenaltyRow, len(DecoyPenaltyColumns))))
                SparseColumnIndices = np.concatenate(
                    (RefColumns, RefPenaltyColumns, DecoyColumns,
                     DecoyPenaltyColumns))
                SparseMatrixEntries = np.concatenate(
                    (RefEntries, RefPenaltyEntries, DecoyEntries,
                     DecoyPenaltyEntries))

                # Renumber the row indices according to the projected
                # spectrum, respecting the 0-indexing.
                SparseRowIndices = stats.rankdata(
                    SparseRowIndices, method='dense').astype(int) - 1

                # Duplicate (row, column) entries are summed by the sparse
                # matrix, merging library peaks that hit the same bin.
                LibrarySparseMatrix = sparse.coo_matrix(
                    (SparseMatrixEntries, (SparseRowIndices,
                                           SparseColumnIndices)))
                LibraryCoeffs = sparse_nnls.lsqnonneg(LibrarySparseMatrix,
                                                      DIASpectrumIntensities,
                                                      {'show_progress': False})
                LibraryCoeffs = LibraryCoeffs['x']

        # Column j belongs to ColumnIDs[j]: targets first, then decoys.
        ColumnIDs = RefPeptideCandidates + DecoyCandidates
        NonzeroCoeffs = [(j, c) for j, c in enumerate(LibraryCoeffs)
                         if c != 0]

        Output = [[0, index, 0, 0, 0, 0]]

        if len(NonzeroCoeffs) > 0:
            Output = [[
                c, index, ColumnIDs[j][0], ColumnIDs[j][1], precMZ, precRT
            ] for j, c in NonzeroCoeffs]

        yield Output