def _correct_psm(self, match):
    """
    Propose corrected peptide candidates for a matched PSM.

    For every ``(delta mass, charge)`` pair in ``match.delta_mass`` the
    original identification is kept when the delta mass is within
    ``self.tol``. Otherwise each correction returned by
    ``self._correct_mass`` is turned into a sequence plus modification
    list, rebuilt with ``self._reconstruct_peptide`` and appended unless
    the same corrected peptide (as keyed by ``self._combine_mods``) was
    already produced.

    Args:
        match: Match record; ``seq``, ``mods``, ``delta_mass``,
            ``nterm_left`` and ``cterm_left`` are read here.

    Returns:
        list: ``Peptide`` candidates, in the order they were generated.
    """
    # separate the match so that only the processed subsequence is used
    parsed = self._parse_matches(match)
    combs, combs_add, artifs, rp = self._restrict_refs(
        parsed.seq_proc, parsed.mods_proc,
        nterm=match.nterm_left, cterm=match.cterm_left)
    sub_seq = "".join(rp)

    candidates = []
    seen = set()
    for dm, charge in match.delta_mass:
        # mass already agrees: keep the identification as it is
        if abs(dm) <= self.tol:
            candidates.append(Peptide(match.seq, charge, match.mods))
            continue

        for corr in self._correct_mass(sub_seq, dm, combs,
                                       combs_add, artifs):
            kept, inserted = corr.rt_idx, corr.ist_idx

            # corrected subsequence: retained residues only, or retained
            # residues with the extra residues inserted
            if inserted is None:
                seq_c = "".join(rp[kept]).upper()
            else:
                extra = np.array(corr.ist_seq, dtype=str)
                seq_c = "".join(np.insert(rp[kept], inserted, extra))

            # move existing modifications to their new 1-based sites;
            # modifications on residues that are not retained are dropped
            mods_c = []
            for mod in parsed.mods_proc:
                old_pos = mod.site - 1
                if old_pos not in kept:
                    continue
                site = kept.index(old_pos) + 1
                if inserted is not None:
                    # every insertion in front of the residue shifts it
                    shift = sum(i < site for i in inserted)
                    site = site + shift
                mods_c.append(mod._replace(site=int(site)))

            # modifications suggested by the correction itself are only
            # appended when nothing is inserted
            # NOTE(review): confirm they are meant to be ignored for
            # corrections that insert residues.
            if inserted is None and corr.mods is not None:
                mods_c += corr.mods

            seq, mods = self._reconstruct_peptide(parsed, seq_c, mods_c)

            # skip peptides that have been generated before
            key = self._combine_mods(seq, mods)
            if key in seen:
                continue
            candidates.append(Peptide(seq, charge, mods))
            seen.add(key)

    return candidates
def _match_syn_peptide(psm, syn_peptides, match=SynMatch):
    """
    Match synthetic peptides to a spectrum by their sequence tags.

    Each synthetic peptide is assigned to the spectrum of ``psm`` (as a
    ``PSM`` built with charge 2) and annotated through
    ``denoise_spectrum``. Only annotated ions whose name starts with
    ``y`` or ``b``, contains no ``-`` and ends with ``[+]`` are counted.
    Runs of consecutive ion indices are treated as sequence tags; the
    peptide is accepted when one tag has at least 4 ions, or at least
    two tags have exactly 3 ions.

    Args:
        psm: Source PSM; ``spectrum``, ``data_id``, ``spec_id`` and
            ``charge`` are read.
        syn_peptides: Iterable of ``(sequence, modifications)`` pairs.
        match: Factory used to build each result record.

    Returns:
        list: One ``match`` record per accepted synthetic peptide. Its
        ``delta_mass`` holds ``(peptide mass - neutral precursor m/z *
        charge, charge)`` for charges 2 to 4.
    """
    pmz_neu = psm.spectrum.prec_mz - constants.FIXED_MASSES["H"]

    # assign mass spectra using synthetic peptides
    matches = []
    for seq, mods in syn_peptides:
        pep = Peptide(seq, 2, mods)
        mch = PSM(psm.data_id, psm.spec_id, pep, psm.spectrum)
        annotates, _ = mch.denoise_spectrum()

        # collect the indices of the annotated y and b ions
        n, seq_ion_index = 0, collections.defaultdict(list)
        for ion, (_, ion_index) in annotates.items():
            if ion[0] in "yb" and "-" not in ion and ion.endswith("[+]"):
                n += 1
                seq_ion_index[ion[0]].append(ion_index)
        if n == 0:
            continue

        # lengths of sequence tags and the maximum ion index per series
        n_tag, max_ion_index = [], {"y": 0, "b": 0}
        for ion_type, found in seq_ion_index.items():
            # Sort once and use the sorted list for both the gap search
            # and the look-ups; the gap positions are positions in the
            # sorted order, so indexing the unsorted list is only correct
            # when the annotations happen to arrive sorted.
            ordered = sorted(found)
            nion = len(ordered)
            if nion > 1:
                i0 = -1
                # positions after which consecutive indices are broken
                breaks, = np.where(np.diff(ordered) > 1)
                for i in breaks:
                    n_tag.append(i - i0)
                    if i - i0 > 1:
                        # NOTE(review): ``i + 1`` is the first ion after
                        # the gap, not the last ion of the tag that just
                        # ended -- kept as in the original, please confirm.
                        max_ion_index[ion_type] = ordered[i + 1]
                    i0 = i
                # the run after the last gap; a single trailing ion is
                # not counted as a tag
                if nion - i0 > 2:
                    n_tag.append(nion - i0 - 1)
                    max_ion_index[ion_type] = ordered[-1]
            else:
                n_tag.append(nion)
                if nion == 1:
                    max_ion_index[ion_type] = ordered[0]

        # accept if a tag has at least 4 ions, or at least two tags
        # have exactly 3 ions
        longest = max(n_tag)
        if longest >= 4 or n_tag.count(3) >= 2:
            # delta mass for charge states 2 to 4
            dms = [(pep.mass - pmz_neu * c, c) for c in range(2, 5)]
            # a terminus is flagged when no maximum index was recorded
            # for the corresponding ion series (b: N-term, y: C-term)
            matches.append(
                match(seq=seq, mods=mods, charge=psm.charge,
                      num_ions=n, num_seqtags=n_tag, max_tag=longest,
                      ion_index=max_ion_index, delta_mass=dms,
                      nterm_left=max_ion_index["b"] == 0,
                      cterm_left=max_ion_index["y"] == 0))

    return matches
def _correct_psm(self, match):
    """
    Propose corrected peptide candidates for a matched PSM.

    For every ``(delta mass, charge)`` pair in ``match.delta_mass`` the
    original identification is kept when the delta mass is within
    ``self.tol``. Otherwise, for each correction returned by
    ``self._correct_mass``, the processed subsequence is restored (every
    symbol found in ``mod_dict`` is replaced by its ``"res"`` entry and
    gets a ``ModSite`` built from its ``"mass"`` and ``"mod"`` entries),
    the modifications of the correction are appended, and the peptide is
    rebuilt with ``self._reconstruct_peptide``. Peptides already produced
    (as keyed by ``self._combine_mods``) are skipped.

    Args:
        match: Match record; ``seq``, ``mods``, ``delta_mass``,
            ``nterm_left`` and ``cterm_left`` are read here.

    Returns:
        list: ``Peptide`` candidates, in the order they were generated.
    """
    # separate the match so that only the processed subsequence is used
    sepm = self._parse_matches(match)
    # get combinations and artifacts
    combs, combs_add, artifs, rp, mod_dict = self._restrict_refs(
        sepm.seq_proc, sepm.mods_proc,
        nterm=match.nterm_left, cterm=match.cterm_left)
    s = "".join(rp)

    # The restored sequence and its modifications depend only on ``s``
    # and ``mod_dict``, so build them once instead of once per correction.
    # NOTE(review): the ModSite objects are shared between candidates;
    # this assumes ModSite is immutable (e.g. a namedtuple).
    residues, base_mods = [], []
    for i, r in enumerate(s):
        if r in mod_dict:
            info = mod_dict[r]
            residues.append(info["res"])
            base_mods.append(ModSite(info["mass"], i + 1, info["mod"]))
        else:
            residues.append(r)
    seq_c = "".join(residues)

    candidates, unique_peps = [], set()
    for dm, c in match.delta_mass:
        # mass already agrees: keep the identification as it is
        if abs(dm) <= self.tol:
            candidates.append(Peptide(match.seq, c, match.mods))
            continue

        # all possible corrections
        # NOTE(review): only ``corr.mods`` is used from each correction.
        for corr in self._correct_mass(s, dm, combs, combs_add, artifs):
            # fresh list per correction so that candidates never share it
            mods_c = list(base_mods)
            if corr.mods is not None:
                mods_c += corr.mods

            seq, mods = self._reconstruct_peptide(sepm, seq_c, mods_c)

            # if the peptide has been identified, ignore it.
            pep = self._combine_mods(seq, mods)
            if pep not in unique_peps:
                candidates.append(Peptide(seq, c, mods))
                unique_peps.add(pep)

    return candidates
def _annotate_by_synthetic_peptides(self, psm, syn_peptides):
    """
    Select synthetic peptides whose m/z agrees with the precursor m/z.

    Every ``(sequence, modifications)`` pair is tried at charge states
    2, 3 and 4. A ``Peptide`` is kept when ``mass / charge`` plus the
    fixed mass of ``"H"`` lies within ``self.tol`` of
    ``psm.spectrum.prec_mz``.

    Args:
        psm: PSM whose ``spectrum.prec_mz`` is the target m/z.
        syn_peptides: Iterable of ``(sequence, modifications)`` pairs.

    Returns:
        list: The matching ``Peptide`` objects.
    """
    target_mz = psm.spectrum.prec_mz

    def charged_forms():
        # every synthetic peptide at each considered charge state
        for sequence, modifications in syn_peptides:
            for charge in range(2, 5):
                yield charge, Peptide(sequence, charge, modifications)

    return [
        peptide
        for charge, peptide in charged_forms()
        if abs(peptide.mass / charge + constants.FIXED_MASSES["H"]
               - target_mz) <= self.tol
    ]
def _annotate_by_synthetic_peptides(self, psm, syn_peptides):
    """
    Collect peptide candidates for a spectrum from synthetic peptides.

    For each ``(sequence, modifications)`` pair, the peptide is tried at
    charge states 2, 3 and 4 and kept when ``mass / charge`` plus the
    fixed mass of ``"H"`` lies within ``self.tol`` of
    ``psm.spectrum.prec_mz``. The candidates returned by
    ``self._add_mods`` and ``self._add_residue`` for the same pair are
    then appended, in that order.

    Args:
        psm: PSM whose ``spectrum.prec_mz`` is the target m/z.
        syn_peptides: Iterable of ``(sequence, modifications)`` pairs.

    Returns:
        list: All collected ``Peptide`` candidates.
    """
    target_mz = psm.spectrum.prec_mz

    found = []
    for sequence, modifications in syn_peptides:
        # the synthetic peptide itself, at each considered charge state
        for charge in (2, 3, 4):
            peptide = Peptide(sequence, charge, modifications)
            theo_mz = peptide.mass / charge + constants.FIXED_MASSES["H"]
            if abs(theo_mz - target_mz) <= self.tol:
                found.append(peptide)

        # variants with artificial modifications or extra residues
        found.extend(self._add_mods(sequence, modifications, target_mz))
        found.extend(self._add_residue(sequence, modifications, target_mz))

    return found
def _add_mods(self, seq, mods, mz):
    """
    Add artificial modifications to a sequence to explain a precursor m/z.

    Sites that carry no modification in ``mods`` (residues, plus the
    ``"nterm"`` / ``"cterm"`` termini) and that have an entry in
    ``self.artifacts`` may receive one artifact. All single artifacts and
    all pairs of them are considered. For charge states 2 to 4, every
    single or pair whose summed mass is within ``self.tol`` of the
    difference between the neutral precursor mass and the peptide mass
    yields one candidate.

    Args:
        seq: Peptide sequence.
        mods: Existing modifications; ``site`` is a 1-based residue
            position or ``"nterm"`` / ``"cterm"``.
        mz: Precursor m/z.

    Returns:
        list: ``Peptide`` candidates with the new modifications placed
        before the existing ones.
    """
    pmass = _peptide_mass(seq, mods)

    # Sites that are still free, each paired with its key into
    # ``self.artifacts``. The final site label is carried along directly.
    # Encoding the termini as integers (0 and len(seq)) cannot work:
    # len(seq) is also the site of the last residue, so a C-terminal
    # artifact would be placed on that residue.
    occupied = {mod.site for mod in mods}
    free_sites = [(i, r) for i, r in enumerate(seq, 1) if i not in occupied]
    if "nterm" not in occupied:
        free_sites.insert(0, ("nterm", "nterm"))
    if "cterm" not in occupied:
        free_sites.append(("cterm", "cterm"))

    # one entry (site, name, mass) per artifact that could be placed;
    # ``self.artifacts[key]`` is iterated as (name, mass) pairs
    singles = [
        (site, name, mass)
        for site, key in free_sites if key in self.artifacts
        for name, mass in self.artifacts[key]
    ]

    # single artifacts followed by all pairs of artifacts
    # NOTE(review): a pair may put two different artifacts on the same
    # site; kept as in the original -- confirm this is acceptable.
    options = [(a,) for a in singles]
    options += itertools.combinations(singles, 2)
    masses = np.array([sum(a[2] for a in option) for option in options])

    existing = list(mods)
    candidates = []
    for c in range(2, 5):
        # mass that has to be explained at this charge state
        dm = (mz - constants.FIXED_MASSES["H"]) * c - pmass
        hits, = np.where(np.absolute(dm - masses) <= self.tol)
        for i in hits:
            added = [ModSite(mass, site, name)
                     for site, name, mass in options[i]]
            candidates.append(Peptide(seq, c, added + existing))

    return candidates
def _add_residue(self, seq, mods, mz):
    """
    Insert one or two residues into a sequence to explain a precursor m/z.

    The residues considered are those occurring in ``seq``; one residue
    or an ordered pair of residues may be inserted. For charge states 2
    to 4, every single or pair whose summed monoisotopic mass is within
    ``self.tol`` of the difference between the neutral precursor mass
    and the peptide mass is inserted at every allowed position.
    Modifications on residues move with their residue; modifications
    whose site is a string (terminal ones) are kept unchanged and listed
    first.

    Modification sites are shifted arithmetically. Replacing modified
    residues by the decimal index of their modification breaks once an
    index needs two characters, and is therefore avoided. Identical
    candidates (same charge, sequence and sites) are returned only once.

    NOTE(review): insertion positions are ``range(len(seq))``, i.e. a
    residue is inserted in front of an existing residue and never
    appended after the last one -- kept as in the original.

    Args:
        seq: Peptide sequence.
        mods: Existing modifications; ``site`` is a 1-based residue
            position or a string for terminal modifications.
        mz: Precursor m/z.

    Returns:
        list: ``Peptide`` candidates with inserted residues.
    """
    pmass = _peptide_mass(seq, mods)
    mz_neutral = mz - constants.FIXED_MASSES["H"]

    # terminal modifications stay as they are; residue modifications
    # have to follow their residue
    term_mods = [mod for mod in mods if isinstance(mod.site, str)]
    res_mods = [mod for mod in mods if not isinstance(mod.site, str)]

    # residues to insert: singles first, then all ordered pairs.
    # Sorted so that the candidate order does not change between runs.
    residues = sorted(set(seq))
    res_masses = [constants.AA_MASSES[a].mono for a in residues]
    insertions = [(r,) for r in residues]
    insertions += itertools.product(residues, repeat=2)
    masses = np.array(
        res_masses
        + [m1 + m2 for m1, m2 in itertools.product(res_masses, repeat=2)])

    # insertion positions (0-based index of the residue that follows the
    # inserted one), keyed by the number of inserted residues
    n = len(seq)
    positions = {
        1: [(i,) for i in range(n)],
        2: list(itertools.combinations_with_replacement(range(n), 2)),
    }

    candidates, seen = [], set()
    for c in range(2, 5):
        # mass that has to be explained at this charge state
        dm = mz_neutral * c - pmass
        hits, = np.where(np.absolute(dm - masses) <= self.tol)
        for k in hits:
            added = insertions[k]
            for pos in positions[len(added)]:
                # stitch the sequence together around the new residues
                pieces, start = [], 0
                for p, r in zip(pos, added):
                    pieces.append(seq[start:p])
                    pieces.append(r)
                    start = p
                pieces.append(seq[start:])
                new_seq = "".join(pieces)

                # a residue at 1-based site ``s`` is pushed right by every
                # insertion in front of it, i.e. at 0-based position < s
                sites = tuple(
                    int(mod.site) + sum(p < mod.site for p in pos)
                    for mod in res_mods)

                key = (c, new_seq, sites)
                if key in seen:
                    continue
                seen.add(key)

                moved = [ModSite(mod.mass, site, mod.mod)
                         for mod, site in zip(res_mods, sites)]
                candidates.append(Peptide(new_seq, c, term_mods + moved))

    return candidates