Пример #1
0
    def _correct_psm(self, match):
        """
        Build candidate peptides that explain the delta masses of a match.

        For each ``(delta mass, charge)`` pair in ``match.delta_mass``:
        if the absolute delta mass is within ``self.tol`` the original
        sequence and modifications are kept; otherwise every correction
        returned by ``self._correct_mass`` is applied to the processed
        sub-sequence and the full peptide is rebuilt with
        ``self._reconstruct_peptide``.

        Args:
            match: Object providing ``seq``, ``mods``, ``delta_mass``,
                ``nterm_left`` and ``cterm_left``.

        Returns:
            List of ``Peptide`` candidates. Corrected candidates are
            de-duplicated by the key from ``self._combine_mods``;
            candidates added because the delta mass is already within
            tolerance are not.
        """
        # split the match so that only the sub-sequence is considered
        parsed = self._parse_matches(match)
        # reference combinations, artifacts and the residues to correct
        combs, combs_add, artifs, residues = self._restrict_refs(
            parsed.seq_proc,
            parsed.mods_proc,
            nterm=match.nterm_left,
            cterm=match.cterm_left)
        subseq = "".join(residues)

        candidates = []
        seen = set()
        for delta, charge in match.delta_mass:
            # nothing to correct if the mass already agrees
            if abs(delta) <= self.tol:
                candidates.append(Peptide(match.seq, charge, match.mods))
                continue

            corrections = self._correct_mass(
                subseq, delta, combs, combs_add, artifs)
            for corr in corrections:
                kept = corr.rt_idx
                if corr.ist_idx is None:
                    # only retained residues, no insertion
                    new_seq = "".join(residues[kept]).upper()
                    # move the retained modifications to their new
                    # 1-based sites
                    new_mods = [
                        mod._replace(site=int(kept.index(mod.site - 1)) + 1)
                        for mod in parsed.mods_proc
                        if mod.site - 1 in kept
                    ]
                    # modifications introduced by the correction itself
                    if corr.mods is not None:
                        new_mods += corr.mods
                else:
                    inserted = np.array(corr.ist_seq, dtype=str)
                    new_seq = "".join(
                        np.insert(residues[kept], corr.ist_idx, inserted))
                    new_mods = []
                    for mod in parsed.mods_proc:
                        if mod.site - 1 not in kept:
                            continue
                        site = kept.index(mod.site - 1) + 1
                        # every insertion index below the site shifts the
                        # modification one position further
                        n_before = sum(k < site for k in corr.ist_idx)
                        new_mods.append(
                            mod._replace(site=int(site + n_before)))

                seq, mods = self._reconstruct_peptide(
                    parsed, new_seq, new_mods)
                # skip peptides that have been generated already
                key = self._combine_mods(seq, mods)
                if key in seen:
                    continue
                candidates.append(Peptide(seq, charge, mods))
                seen.add(key)

        return candidates
Пример #2
0
    def _match_syn_peptide(psm, syn_peptides, match=SynMatch):
        """
        Force-match a spectrum against a list of synthetic peptides.

        Each ``(seq, mods)`` pair is turned into a doubly charged
        ``Peptide`` and annotated against ``psm.spectrum``. Annotated
        ``b``/``y`` ions whose name contains no ``"-"`` and ends with
        ``"[+]"`` are grouped into runs of consecutive ion indices
        (sequence tags). The peptide is reported when one tag has at
        least 4 ions or at least two tags have exactly 3 ions.

        Args:
            psm: Object providing ``spectrum`` (with ``prec_mz``),
                ``data_id``, ``spec_id`` and ``charge``.
            syn_peptides: Iterable of ``(seq, mods)`` pairs.
            match: Factory called with keyword arguments to build one
                result record.

        Returns:
            List of records created by ``match``.
        """
        neutral_pmz = psm.spectrum.prec_mz - constants.FIXED_MASSES["H"]

        matches = []
        for seq, mods in syn_peptides:
            # annotate the spectrum with the synthetic peptide
            pep = Peptide(seq, 2, mods)
            syn_psm = PSM(psm.data_id, psm.spec_id, pep, psm.spectrum)
            annotations, _ = syn_psm.denoise_spectrum()

            # collect the ion indices per ion series
            ion_indices = collections.defaultdict(list)
            for name, (_, index) in annotations.items():
                if (name[0] in "yb" and "-" not in name
                        and name.endswith("[+]")):
                    ion_indices[name[0]].append(index)
            n_ions = sum(len(found) for found in ion_indices.values())
            if n_ions == 0:
                continue

            # sizes of the sequence tags and the ion index recorded for
            # each series
            tag_sizes, last_index = [], {"y": 0, "b": 0}
            for series, indices in ion_indices.items():
                count = len(indices)
                if count <= 1:
                    tag_sizes.append(count)
                    if count == 1:
                        last_index[series] = indices[0]
                    continue

                # a gap larger than 1 between sorted indices ends a tag
                gaps = np.diff(sorted(indices))
                prev = -1
                for brk in np.where(gaps > 1)[0]:
                    size = brk - prev
                    tag_sizes.append(size)
                    if size > 1:
                        # NOTE(review): ``brk`` refers to the sorted
                        # indices, but the list indexed here is in
                        # annotation order and the element taken is the
                        # one after the break -- confirm this is intended.
                        last_index[series] = indices[brk + 1]
                    prev = brk
                # trailing tag, recorded only if it has 2 or more ions
                if count - prev > 2:
                    tag_sizes.append(count - prev - 1)
                    last_index[series] = max(indices)

            longest = max(tag_sizes)
            if longest >= 4 or tag_sizes.count(3) >= 2:
                # delta masses for charge states 2 to 4
                deltas = [(pep.mass - neutral_pmz * c, c)
                          for c in range(2, 5)]
                # a terminus is flagged when no ion index was recorded
                # for the corresponding series
                record = match(seq=seq,
                               mods=mods,
                               charge=psm.charge,
                               num_ions=n_ions,
                               num_seqtags=tag_sizes,
                               max_tag=longest,
                               ion_index=last_index,
                               delta_mass=deltas,
                               nterm_left=last_index["b"] == 0,
                               cterm_left=last_index["y"] == 0)
                matches.append(record)

        return matches
Пример #3
0
    def _correct_psm(self, match):
        """
        Build candidate peptides that explain the delta masses of a match.

        For each ``(delta mass, charge)`` pair in ``match.delta_mass``:
        if the absolute delta mass is within ``self.tol`` the original
        sequence and modifications are kept; otherwise one candidate is
        built per correction returned by ``self._correct_mass``. Each of
        these candidates carries the modifications restored from
        ``mod_dict`` plus the modifications of the correction.

        Args:
            match: Object providing ``seq``, ``mods``, ``delta_mass``,
                ``nterm_left`` and ``cterm_left``.

        Returns:
            List of ``Peptide`` candidates. Corrected candidates are
            de-duplicated by the key from ``self._combine_mods``;
            candidates added because the delta mass is already within
            tolerance are not.
        """
        # split the match so that only the sub-sequence is considered
        sepm = self._parse_matches(match)
        # get combinations and artifacts
        combs, combs_add, artifs, rp, mod_dict = self._restrict_refs(
            sepm.seq_proc,
            sepm.mods_proc,
            nterm=match.nterm_left,
            cterm=match.cterm_left)
        s = "".join(rp)

        # The restored sequence and the modified positions depend only on
        # ``s`` and ``mod_dict``, so they are computed once instead of
        # once per correction.
        seq_c = "".join(mod_dict[r]["res"] if r in mod_dict else r
                        for r in s)
        mod_sites = [(i + 1, mod_dict[r])
                     for i, r in enumerate(s) if r in mod_dict]

        # get the combinations of error loss
        candidates, unique_peps = [], set()
        for dm, c in match.delta_mass:
            if abs(dm) <= self.tol:
                candidates.append(Peptide(match.seq, c, match.mods))
                continue

            # all possible corrections
            corrects = self._correct_mass(s, dm, combs, combs_add, artifs)
            for corr in corrects:
                # fresh modification objects for every candidate
                mods_c = [ModSite(info["mass"], site, info["mod"])
                          for site, info in mod_sites]
                # set up modification sites from corrections.
                if corr.mods is not None:
                    mods_c += corr.mods

                seq, mods = self._reconstruct_peptide(sepm, seq_c, mods_c)
                # if the peptide has been identified, ignore it.
                pep = self._combine_mods(seq, mods)
                if pep not in unique_peps:
                    candidates.append(Peptide(seq, c, mods))
                    unique_peps.add(pep)

        return candidates
Пример #4
0
 def _annotate_by_synthetic_peptides(self, psm, syn_peptides):
     """ Annotate the spectrum by corresponding synthetic peptide """
     mz = psm.spectrum.prec_mz
     pep_candidates = []
     for _seq, _mods in syn_peptides:
         for c in range(2, 5):
             pep = Peptide(_seq, c, _mods)
             _mz = pep.mass / c + constants.FIXED_MASSES["H"]
             if abs(_mz - mz) <= self.tol:
                 pep_candidates.append(pep)
     return pep_candidates
Пример #5
0
    def _annotate_by_synthetic_peptides(self, psm, syn_peptides):
        """ Annotate the spectrum by corresponding synthetic peptide """
        mz = psm.spectrum.prec_mz
        pep_candidates = []
        for seq, mods in syn_peptides:
            for c in range(2, 5):
                pep = Peptide(seq, c, mods)
                mz_ = pep.mass / c + constants.FIXED_MASSES["H"]
                if abs(mz_ - mz) <= self.tol:
                    pep_candidates.append(pep)

            # consider artificial modifications too
            pep_candidates += self._add_mods(seq, mods, mz)
            pep_candidates += self._add_residue(seq, mods, mz)

        return pep_candidates
Пример #6
0
    def _add_mods(self, seq, mods, mz):
        """
        Add one or two artificial modifications to explain the m/z.

        Unmodified residues and unmodified termini that have an entry in
        ``self.artifacts`` are candidate sites. For charge states 2 to 4,
        every single artifact and every pair of artifacts whose mass
        equals the difference between the mass implied by ``mz`` and the
        peptide mass, within ``self.tol``, yields one candidate.

        Args:
            seq: Peptide sequence.
            mods: List of existing modifications; each provides ``site``,
                which is compared against 1-based residue positions and
                the strings ``"nterm"``/``"cterm"``.
            mz: Precursor m/z.

        Returns:
            List of ``Peptide`` objects carrying the new modifications
            followed by the existing ones.
        """
        candidates = []
        # calculate mass
        pmass = _peptide_mass(seq, mods)

        # Candidate sites as (site, artifact key). The site label travels
        # with the artifact, so the termini can never be confused with
        # the residues next to them.
        occupied = {mod.site for mod in mods}
        targets = []
        if "nterm" not in occupied:
            targets.append(("nterm", "nterm"))
        targets.extend((i, r) for i, r in enumerate(seq, 1)
                       if i not in occupied)
        if "cterm" not in occupied:
            targets.append(("cterm", "cterm"))

        # single artifacts as (site, name, mass)
        singles = [(site, name, m)
                   for site, key in targets if key in self.artifacts
                   for name, m in self.artifacts[key]]

        # single artifacts followed by all pairs of artifacts
        # NOTE(review): pairs may combine two artifacts on the same site,
        # as before -- confirm whether this is wanted.
        options = [(art,) for art in singles]
        options += itertools.combinations(singles, 2)
        masses = np.array([sum(m for _, _, m in opt) for opt in options])

        # iterate through all possibilities
        proton = constants.FIXED_MASSES["H"]
        for c in range(2, 5):
            dm = (mz - proton) * c - pmass
            ix, = np.where(np.absolute(dm - masses) <= self.tol)
            for i in ix:
                mod_x = [ModSite(m, site, name)
                         for site, name, m in options[i]]
                candidates.append(Peptide(seq, c, mod_x + mods))

        return candidates
Пример #7
0
    def _add_residue(self, seq, mods, mz):
        """
        Insert one or two residues into the sequence to explain the m/z.

        Only residues already present in ``seq`` are inserted. For
        charge states 2 to 4, each residue or ordered residue pair whose
        mass equals the difference between the mass implied by ``mz``
        and the peptide mass, within ``self.tol``, is inserted before
        every existing residue position. Nothing is appended after the
        last residue. Residue modifications follow their residue;
        modifications with a string site are kept unchanged.

        Modification sites are shifted arithmetically from the insertion
        positions, so any number of modifications, including several on
        the same residue, is supported.

        Args:
            seq: Peptide sequence.
            mods: Existing modifications, each providing ``mass``,
                ``site`` and ``mod``.
            mz: Precursor m/z.

        Returns:
            List of ``Peptide`` objects; identical sequences produced by
            different insertions are not de-duplicated.
        """
        pmass = _peptide_mass(seq, mods)
        mz_neutral = mz - constants.FIXED_MASSES["H"]

        # modifications with a string site are unaffected by insertions
        term_mods = [mod for mod in mods if isinstance(mod.site, str)]
        res_mods = [mod for mod in mods if not isinstance(mod.site, str)]

        # residues to insert, in sorted order for reproducible output
        residues = sorted(set(seq))
        single_masses = [constants.AA_MASSES[a].mono for a in residues]
        # single residues followed by all ordered pairs of residues
        inserts = [(r,) for r in residues]
        inserts += itertools.product(residues, repeat=2)
        masses = np.array(
            single_masses
            + [m1 + m2
               for m1, m2 in itertools.product(single_masses, repeat=2)])

        # insertion positions: a residue inserted at cut ``p`` is placed
        # in front of the residue at 0-based index ``p``
        n = len(seq)
        cut_sets = {
            1: [(p,) for p in range(n)],
            2: list(itertools.combinations_with_replacement(range(n), 2)),
        }

        candidates = []
        for c in range(2, 5):
            dm = mz_neutral * c - pmass
            match_ix, = np.where(np.absolute(dm - masses) <= self.tol)
            for k in match_ix:
                added = inserts[k]
                for cuts in cut_sets[len(added)]:
                    # rebuild the sequence with the residues inserted
                    pieces, start = [], 0
                    for cut, res in zip(cuts, added):
                        pieces.extend((seq[start:cut], res))
                        start = cut
                    pieces.append(seq[start:])
                    # each insertion in front of a modified residue
                    # moves its 1-based site one position further
                    shifted = [
                        ModSite(
                            mod.mass,
                            int(mod.site) + sum(cut < mod.site
                                                for cut in cuts),
                            mod.mod)
                        for mod in res_mods
                    ]
                    candidates.append(
                        Peptide("".join(pieces), c, term_mods + shifted))
        return candidates