Пример #1
0
 def core(self, seq: str) -> None:
     """ Replaces the core sequence and rebuilds the analyses made from it.

         A RobustProteinAnalysis is built for the sequence in both
         monoisotopic and non-monoisotopic mode, the sequence is stored,
         and self._calculate_mw() is then called.

         Arguments:
             seq: the new core sequence
     """
     monoisotopic_analysis = utils.RobustProteinAnalysis(seq, monoisotopic=True)
     self.core_analysis_monoisotopic = monoisotopic_analysis
     standard_analysis = utils.RobustProteinAnalysis(seq, monoisotopic=False)
     self.core_analysis = standard_analysis
     self._core = seq
     # presumably refreshes the stored weights from the analyses set above
     self._calculate_mw()
Пример #2
0
 def core(self, seq: str) -> None:
     """ Replaces the core sequence and resets both stored weights.

         Arguments:
             seq: the new core sequence, which must be a non-empty string
     """
     assert isinstance(seq, str) and seq
     self._core = seq
     # -1 presumably marks the weights as not yet calculated for this core
     self._weight = self._monoisotopic_weight = -1.
     monoisotopic_analysis = utils.RobustProteinAnalysis(seq, monoisotopic=True)
     self.core_analysis_monoisotopic = monoisotopic_analysis
     standard_analysis = utils.RobustProteinAnalysis(seq, monoisotopic=False)
     self.core_analysis = standard_analysis
Пример #3
0
    def test_molecular_weight_ignore(self):
        """ RobustProteinAnalysis.molecular_weight() gives the same weight
            whether invalid residues are ignored by default or explicitly
        """
        expected_weight = 802.9621

        # ignore_invalid is left at its default, which is True
        implicit = utils.RobustProteinAnalysis("MAGICXHAT")
        self.assertEqual(expected_weight, implicit.molecular_weight())

        explicit = utils.RobustProteinAnalysis("MAGICXHAT", ignore_invalid=True)
        self.assertEqual(expected_weight, explicit.molecular_weight())
Пример #4
0
    def test_init(self):
        """ RobustProteinAnalysis can be built with either boolean for
            ignore_invalid and raises TypeError for anything else
        """
        for accepted in (True, False):
            instance = utils.RobustProteinAnalysis("MAGICHAT", ignore_invalid=accepted)
            self.assertIsInstance(instance, utils.RobustProteinAnalysis)

        for rejected in ("none", None, 3, []):
            with self.assertRaises(TypeError):
                utils.RobustProteinAnalysis("MAGICHAT", ignore_invalid=rejected)
Пример #5
0
 def cut_weight(self) -> float:
     """ Determines the weight of the core peptide without its tail.

         Returns:
             the value of self.molecular_weight when c_cut is empty,
             otherwise the weight calculated for the core with the last
             len(c_cut) residues removed

         Raises:
             ValueError: if no core has been set
     """
     if not self.core:
         raise ValueError("Cannot calculate cut weights without a core")
     if self.c_cut:
         trimmed_core = self.core[:-len(self.c_cut)]
         trimmed_analysis = utils.RobustProteinAnalysis(trimmed_core, monoisotopic=False)
         return self._calculate_weight(trimmed_analysis)
     # nothing is cut off, so the full weight applies
     return self.molecular_weight
Пример #6
0
    def _calculate_number_of_bridges(self) -> int:
        """ Predicts the number of disulfide bridges of the lassopeptide
            from the number of cysteines in its core.

            Returns:
                2 for four or more cysteines, 1 for two or three, else 0
        """
        residue_counts = utils.RobustProteinAnalysis(self.core, monoisotopic=True).count_amino_acids()
        cysteines = residue_counts['C']
        if cysteines < 2:
            return 0
        return 2 if cysteines >= 4 else 1
Пример #7
0
    def number_bridges(self):
        """ Predicts the number of disulfide bridges of the lassopeptide
            from the number of cysteines in its core.

            The stored bridge count is set to 2 for four or more cysteines
            and to 1 for two or three. With fewer than two cysteines the
            stored value is left as it was.

            Returns:
                the stored bridge count
        """
        residue_counts = utils.RobustProteinAnalysis(self.core, monoisotopic=True).count_amino_acids()
        cysteines = residue_counts['C']
        if cysteines >= 2:
            self._num_bridges = 2 if cysteines >= 4 else 1
        return self._num_bridges
Пример #8
0
 def test_molecular_weight_average(self):
     """ RobustProteinAnalysis.molecular_weight() gives the expected
         weight when invalid residues are not ignored
     """
     analysis = utils.RobustProteinAnalysis("MAGICXHAT", ignore_invalid=False)
     observed_weight = analysis.molecular_weight()
     self.assertEqual(912.9621, observed_weight)
Пример #9
0
 def test_uppercase(self):
     """ RobustProteinAnalysis exposes a mixed case sequence in upper case,
         both as the original sequence and as the sequence
     """
     expected = "MAGICHAT"
     analysis = utils.RobustProteinAnalysis("Magichat")
     assert analysis.original_sequence == expected
     assert analysis.sequence == expected
Пример #10
0
def acquire_rodeo_heuristics(record: secmet.Record, query: secmet.CDSFeature,
                             leader: str, core: str,
                             domains: List[str]) -> Tuple[int, List[float]]:
    """ Calculate heuristic scores for RODEO

        Each heuristic appends its value(s) to the feature list in a fixed
        order, and most of them also adjust the score.

        Arguments:
            record: the record instance to analyse
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the domains found within CDS features of the cluster

        Returns:
            a tuple of
                the RODEO score, and
                a list of floats for use in the RODEO SVM

        Raises:
            ZeroDivisionError: if the core is empty, as the leader to core
                               length ratio cannot be calculated
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core
    # one definition of the gallidermin profile names, shared by the cluster
    # level and precursor level checks (the cluster level check used to look
    # for a misspelt "matura_ab" and so could never see "mature_ab")
    gallidermin_profiles = {"Gallidermin", "mature_a", "mature_ab", "mature_b"}

    def flag(condition: object, weight: int = 0) -> None:
        """ Records a binary feature, adjusting the score if it is set """
        nonlocal score
        if condition:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    def motif_start(pattern: str, sequence: str) -> int:
        """ Start of the first match of the pattern, or 0 without a match
            (so a match at position 0 can't be told apart from no match)
        """
        found = re.search(pattern, sequence)
        return found.start() if found else 0

    def mass(sequence: str, ignore_invalid: bool) -> float:
        """ Monoisotopic molecular weight of the sequence """
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True,
                                               ignore_invalid=ignore_invalid)
        return analysis.molecular_weight()

    # Leader peptide contains FxLD motif
    fxld_start = motif_start('F.LD', leader)
    flag(re.search('F.LD', leader), 2)
    # Core residue positions of the Sx4C, Tx4C, Sx5C and Tx5C motifs
    for pattern in ('S....C', 'T....C', 'S.....C', 'T.....C'):
        tabs.append(motif_start(pattern, core))
    # Precursor is within 500 nt?
    # NOTE(review): if distance_to_pfam reports "no hit" as a negative
    # number, that would also pass this check - confirm
    distance = utils.distance_to_pfam(record, query, ['LANC_like', 'Lant_dehyd_C'])
    flag(distance < 500, 1)
    # Cluster contains LanB dehydratase domain (PF04738)
    flag("Lant_dehyd_C" in domains, 2)
    # Cluster contains Lan C cyclase domain (PF05147)
    flag("LANC_like" in domains, 2)
    # Cluster LACKS LanB dehydratase domain (PF04738)
    flag("Lant_dehyd_C" not in domains, -2)
    # Cluster LACKS Lan C cyclase domain (PF05147)
    flag("LANC_like" not in domains, -2)
    # Cluster contains LanB dehydratase elimination C-terminal domain (PF14028)
    flag("PF14028" in domains, 2)
    # Cluster contains S8 peptidase subtilase (PF00082)
    flag("Peptidase_S8" in domains, 1)
    # Cluster contains C39 peptidase (PF03412)
    flag("Peptidase_C39" in domains, 1)
    # Cluster contains ABC transporter (PF00005)
    flag("PF00005" in domains, 1)
    # Cluster contains YcaO-like protein (PF02624)
    flag("YcaO" in domains, -4)
    # Cluster contains ThiF-like protein (PF00899)
    flag("ThiF" in domains, -4)
    # Cluster contains PF02052 (Gallidermin), feature only
    flag(gallidermin_profiles.intersection(domains))
    # Cluster contains PF8130, feature only
    flag("Antimicr18" in domains)
    # Precursor peptide mass < 4000 Da
    flag(mass(precursor, ignore_invalid=True) < 4000, -3)
    # Core peptide mass < 2000 Da
    flag(mass(core, ignore_invalid=True) < 2000, -3)

    # Precursor peptide pHMMs, one feature each but a single shared bonus
    precursor_profiles = (
        # gallidermin superfamily (cl03420) HMM
        {"TIGR03731", "Gallidermin"},
        # lantibio_gallid (TIGR03731) HMM
        {"TIGR03731"},
        # lanti_SCO0268 superfamily (cl22812) HMM
        {"TIGR04451", "strep_PEQAXS"},
        # LD_lanti_pre (TIGR04363) HMM
        {"LD_lanti_pre"},
        # Antimicrobial18 (cl06940) HMM
        {"Antimicr18"},
        # gallidermin (PF02052) HMM
        gallidermin_profiles,
        # Antimicrobial18 (PF08130) HMM
        {"Antimicr18"},
    )
    precursor_hit = False
    for profile_names in precursor_profiles:
        if cds_has_domains(query, profile_names):
            precursor_hit = True
            tabs.append(1)
        else:
            tabs.append(0)
    if precursor_hit:
        score += 3

    # Unmodified precursor, leader and core peptide masses
    for sequence in (precursor, leader, core):
        tabs.append(float(mass(sequence, ignore_invalid=False)))
    # Lengths of the leader, core and precursor peptides
    for sequence in (leader, core, precursor):
        tabs.append(len(sequence))
    # Ratio of length of leader peptide / length of core peptide
    tabs.append(len(leader) / len(core))
    # Core peptide >= 35 residues
    flag(len(core) >= 35, 1)
    # Core peptide contains CC motif (not in last 3 residues)
    flag('CC' in core[:-3], -3)
    # Leader peptide has > 4 negatively charged residues
    flag(sum(leader.count(aa) for aa in "DE") > 4, 1)
    # Leader peptide has net negative charge
    charge_dict = {"E": -1, "D": -1, "K": 1, "R": 1}
    flag(sum(charge_dict.get(aa, 0) for aa in leader) < 0, 1)
    # Leader residue position of FxLD motif
    tabs.append(fxld_start)
    # Core peptide contains C-terminal CC (within last 3 residues)
    flag('CC' in core[-3:], 2)
    # Core peptide contains DGCGxTC / SFNS / SxxLC / CTxGC / TPGC / SFNSxC motifs
    motifs = (('DGCG.TC', 2), ('SFNS', 2), ('S..LC', 2), ('CT.GC', 1),
              ('TPGC', 1), ('SFNS.C', 1))
    for motif, motif_score in motifs:
        flag(re.search(motif, core), motif_score)
    # Core peptide contains < 2 or < 3 Cys, a pair of features
    cysteines = core.count("C")
    if cysteines < 2:
        score -= 6
        tabs += [1, 1]
    elif cysteines < 3:
        score -= 3
        tabs += [1, 0]
    else:
        tabs += [0, 0]
    # No Cys/Ser/Thr in core peptide
    for amino, penalty in [("C", -10), ("S", -4), ("T", -4)]:
        flag(amino not in core, penalty)
    # Lanthionine regex maximum ring number > 4
    numrings, profile = lanscout(core)
    flag(numrings > 4, 2)
    # Lanthionine regex maximum ring number < 3
    flag(numrings < 3, -2)
    # Lanthionine regex 4/5/6/7/8-membered rings: the first two profile
    # entries only count when above 2, the rest count when non-zero
    ring_scores = [2, 2, 2, 2, 1]
    for position, ring_count in enumerate(profile):
        if position < 2:
            present = ring_count not in [0, 1, 2]
        else:
            present = ring_count != 0
        # the score is only looked up for rings that are present
        if present:
            score += ring_scores[position]
            tabs.append(1)
        else:
            tabs.append(0)
    return score, tabs
Пример #11
0
def acquire_rodeo_heuristics(
        cluster: secmet.Protocluster, query: secmet.CDSFeature, leader: str,
        core: str, domains: Dict[str, int]
) -> Tuple[int, List[float], List[int]]:
    """ Calculate heuristic scores for RODEO

        Each heuristic appends its value(s) to the feature list in a fixed
        order, and most of them also adjust the score.

        Arguments:
            cluster: the protocluster containing the query, its parent record
                     and CDS children are used for the rSAM (PF04055) checks
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the domains found in the cluster, only keys are checked

        Returns:
            a tuple of
                the RODEO score,
                the list of feature values, and
                the hit ends of all PF04055 hsps with a bitscore above 40

        Raises:
            ZeroDivisionError: if both leader and core are empty
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    def flag(condition: object, weight: int) -> None:
        """ Records a binary feature, adjusting the score if it is set """
        nonlocal score
        if condition:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    # Calculated precursor, leader and core peptide masses (Da)
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))
    # Distance to the rSAM (PF04055), looked up once and reused below
    distance = utils.distance_to_pfam(cluster.parent_record, query, ['PF04055'])
    tabs.append(distance)
    # rSAM within 500 nt?
    flag(distance < 500, 1)
    # rSAM within 150 nt?
    flag(distance < 150, 1)
    # rSAM missing (-1) or further than 10000 nt?
    # NOTE(review): this was commented as "further than 1000 nt" while the
    # code has always checked 10000 - confirm which cut-off is intended
    flag(distance == -1 or distance > 10000, -2)
    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    first_cys = precursor.find("C")
    if first_cys != -1 and 0.25 <= first_cys / len(precursor) <= 0.60:
        score += 2
        tabs += [1, 0]
    else:
        # also covers a precursor without any Cys
        score -= 2
        tabs += [0, 1]
    # Three or more Cys; Less than 3 Cys
    if precursor.count("C") >= 3:
        score += 4
        tabs += [1, 0]
    else:
        score -= 4
        tabs += [0, 1]
    # CxxxxxC / CxxxC / CxxC / CxC; CC / CCC
    motifs = (('C.{5}C', 2), ('C.{3}C', 1), ('C.{2}C', 1), ('C.{1}C', 1),
              ('CC', -2), ('CCC', -2))
    for pattern, motif_score in motifs:
        flag(re.search(pattern, core), motif_score)
    # No Cys in last 1/4th? (scores in both directions)
    quarter_length = -len(precursor) // 4
    if "C" not in precursor[quarter_length:]:
        score += 1
        tabs.append(1)
    else:
        score -= 1
        tabs.append(0)
    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    flag(precursor[:two_thirds].count("C") == 2
         and precursor[two_thirds:].count("C") == 1, 1)
    # Peptide matches SboA hmm
    flag(cds_has_domains(query, {"Subtilosin_A"}), 3)
    # Peptide matches SkfA hmm
    flag(cds_has_domains(query, {"TIGR04404"}), 3)
    # Peptide matches SCIFF hmm
    flag(cds_has_domains(query, {"TIGR03973"}), 2)
    # cluster has PqqD/RRE (PF05402)
    flag("PF05402" in domains, 1)
    # cluster has SPASM domain (PF13186)
    flag("PF13186" in domains, 1)
    # PF04055 (rSAM) domain start > 80
    runresults = subprocessing.run_hmmsearch(
        path.get_full_path(__file__, "data", "PF04055.hmm"),
        fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitends = []
    for runresult in runresults:
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            if hsp.bitscore > 40:
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    # max_start only leaves 0 when a hit was stored, so no separate
    # "any hits" check is needed
    flag(max_start > 80, 1)
    # cluster has peptidase, one feature per peptidase domain
    peptidase_domains = [
        "Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16", "Peptidase_S41"
    ]
    peptidase_hits = [pepdom in domains for pepdom in peptidase_domains]
    for peptidase_hit in peptidase_hits:
        flag(peptidase_hit, 1)
    # cluster has transporter, one feature per transporter domain
    for transpdom in ("PF00005", "PF00664"):
        flag(transpdom in domains, 1)
    # cluster has response regulator (PF00072)
    flag("PF00072" in domains, 1)
    # cluster has major facilitator (PF07690)
    flag("PF07690" in domains, 1)
    # cluster has ATPase (PF13304)
    flag("PF13304" in domains, 1)
    # cluster has Fer4_12 (PF13353)
    flag("PF13353" in domains, 1)
    # cluster has rSAM (PF04055)
    flag("PF04055" in domains or "TIGR03975" in domains, 2)
    # cluster has no recognized peptidase
    flag(not any(peptidase_hits), -2)
    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    # NOTE(review): last_motif_c is built from a negative index and so is
    # never positive, meaning the ratio can't reach 0.35 and the else branch
    # is always taken. Behaviour is kept as is - confirm what was intended
    last_motif_c = 0
    index = -1
    for aa in reversed(precursor):
        if aa == "C" and "C" in precursor[index - 6:index]:
            last_motif_c = index + 1
        index -= 1
    if 0.35 <= last_motif_c / len(precursor) <= 0.65:
        score += 3
        tabs += [0, 1]
    else:
        score -= 2
        tabs += [1, 0]
    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    flag(len(re.findall(cysrex, core)) > 1, 2)
    return score, tabs, hitends
Пример #12
0
def generate_rodeo_svm_csv(leader: str, core: str, previously_gathered_tabs: List[float]) -> List[float]:
    """ Generates all the items for one candidate precursor peptide

        Arguments:
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            previously_gathered_tabs: feature values gathered earlier, which
                                      are placed directly after the two
                                      fixed leading columns

        Returns:
            the full list of column values for the candidate
    """
    precursor = leader + core
    # Precursor index and classification, both fixed values
    columns: List[float] = [1, 0]
    columns.extend(previously_gathered_tabs)

    stats = ThioStatistics(core)
    columns.extend([
        # Number repeating blocks of heterocyclizable residues in core
        stats.block_repeats,
        # Number of core repeating Cys
        stats.c_repeats,
        # Number of core repeating Ser
        stats.s_repeats,
        # Number of core repeating Thr
        stats.t_repeats,
        # Number of blocks of heterocyclizable residues in core
        stats.heteroblocks,
        # Average core heterocycle block length
        stats.average_heteroblock_length,
    ])

    # Unmodified precursor, leader and core peptide masses
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True)
        columns.append(analysis.molecular_weight())
    # Lengths of precursor, leader and core
    columns.extend(len(sequence) for sequence in (precursor, leader, core))
    # Ratio of the core and leader lengths
    # NOTE(review): this was described as "leader / core", but the value
    # computed is core / leader - confirm which one the SVM expects
    columns.append(len(core) / len(leader))
    # Ratio of heterocyclizable residues / length of core
    columns.append(sum(core.count(aa) for aa in "CST") / len(core))

    # For the leader, then the core, then the entire precursor (leader+core):
    # the count of each amino acid, followed by the totals of the aromatic,
    # negatively charged, positively charged, charged, aliphatic and
    # hydroxyl residues
    residue_groups = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")
    for sequence in (leader, core, precursor):
        columns.extend(sequence.count(aa) for aa in "ARDNCQEGHILKMFPSTWYV")
        for group in residue_groups:
            columns.append(sum(sequence.count(aa) for aa in group))
    return columns
Пример #13
0
def acquire_rodeo_heuristics(leader: str, core: str,  # pylint: disable=too-many-branches,too-many-statements
                             domains: Set[str]) -> Tuple[int, List[float]]:
    """ Calculate heuristic scores for RODEO

        Every heuristic appends a single binary feature to the feature list,
        in a fixed order, and adjusts the score when the feature is set.

        Arguments:
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the names of the domains to check for

        Returns:
            a tuple of
                the RODEO score, and
                the list of binary features
    """
    tabs: List[float] = []
    score = 0

    def flag(condition: object, weight: int) -> None:
        """ Records a binary feature, adjusting the score if it is set """
        nonlocal score
        if condition:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    # each rule is set when any one of its names is among the domains
    domain_rules = (
        # Contains TOMM YcaO (PF02624)
        (("YcaO",), 2),
        # Contains LanB N-terminal domain (PF04738)
        (("Lant_dehyd_N", "PF04738", "tsrC"), 2),
        # Contains LanB C-terminal domain (PF14028)
        (("Lant_dehyd_C",), 2),
        # Contains TOMM dehydrogenase (PF00881)
        (("PF00881",), 2),
        # Contains rSAM methyltransferase (PF04055)
        (("PF04055",), 2),
        # Contains P450 (PF00067)
        (("p450",), 1),
        # Contains ABC transporter or abhydrolase, one feature per profile
        (("PF00005",), 1),
        (("PF01061",), 1),
        (("PF12698",), 1),
        (("PF12697",), 1),
        (("PF00561",), 1),
    )
    for names, weight in domain_rules:
        flag(any(name in domains for name in names), weight)

    # CSS/CTT, SS/SSS/SSSS, CC/CCC/CCCC, TT/TTT/TTTT motifs
    motifs = (('CS{2,}', 1), ('CT{2,}', 1),
              ('S{2,}', 1), ('S{3,}', 1), ('S{4,}', 2),
              ('C{2,}', 1), ('C{3,}', 1), ('C{4,}', 2),
              ('T{2,}', 1), ('T{3,}', 1), ('T{4,}', 2))
    for pattern, motif_score in motifs:
        flag(re.search(pattern, core), motif_score)
    # No Cys/Ser/Thr core residues
    for residue in "CST":
        flag(residue not in core, -2)
    # Mass of core peptide (unmodified) < 2100
    core_mass = utils.RobustProteinAnalysis(core, monoisotopic=True).molecular_weight()
    flag(core_mass < 2100, 1)
    # Sum of repeating Cys/Ser/Thr > 4
    stats = ThioStatistics(core)
    flag(stats.cst_repeats > 4, 2)
    # Avg heterocycle block length > 3
    flag(stats.average_heteroblock_length > 3, 2)
    # Leader net charge < 5
    charge_dict = {"E": -1, "D": -1, "K": 1, "R": 1}
    leader_charge = sum(charge_dict.get(aa, 0) for aa in leader)
    flag(leader_charge < 5, 2)
    # Leader net charge > 0
    flag(leader_charge > 0, -2)
    # Leader contains a Cys
    flag("C" in leader, -1)
    # Peptide terminates Cys/Ser/Thr
    flag(core[-1] in "CST", 1)
    # Core contains >= 2 positive residues
    flag(sum(core.count(aa) for aa in "RK") >= 2, -1)
    # Number of heterocyclizable residues to core ratio >= 0.4
    flag(sum(core.count(aa) for aa in "CST") / len(core) >= 0.4, 2)
    return score, tabs
Пример #14
0
def acquire_rodeo_heuristics(record: Record, cluster: Cluster,
                             query: CDSFeature, leader: str,
                             core: str) -> Tuple[int, List[Union[float, int]]]:
    """ Calculate heuristic scores for RODEO

        All but the last two heuristics append one value to the feature
        list, in a fixed order. The last two only adjust the score.

        Arguments:
            record: the record containing the query
            cluster: the cluster containing the query, used for the check
                     of the strand of the lasso cyclase
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core

        Returns:
            a tuple of
                the RODEO score, and
                the list of feature values

        Raises:
            ZeroDivisionError: if the core is empty, as the leader to core
                               length ratio cannot be calculated
    """
    tabs = []  # type: List[Union[float, int]]
    score = 0

    def flag(condition: object, weight: int) -> None:
        """ Records a binary feature, adjusting the score if it is set """
        nonlocal score
        if condition:
            score += weight
            tabs.append(1)
        else:
            tabs.append(0)

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    distance = utils.distance_to_pfam(record, query, ['PF13471', 'PF00733', 'PF05402'])
    tabs.append(distance)
    # NOTE(review): a distance of -1, which the PF13471 check at the end of
    # this function treats as "not found", also passes the two "within"
    # checks below - confirm that this is intended
    # Within 500 nucleotides of any biosynthetic protein (E, B, C): +1
    flag(distance < 500, 1)
    # Within 150 nucleotides of any biosynthetic protein (E, B, C): +1
    flag(distance < 150, 1)
    # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C): -2
    flag(distance > 1000, -2)
    # Core region has 2 or 4 Cys residues: +1
    cysteines = core.count("C")
    flag(cysteines in [2, 4], 1)
    # Leader region is longer than core region: +2
    flag(len(leader) > len(core), 2)
    # Core has 7 (Glu) or 8(Glu/Asp) or 9 (Asp) membered ring possible: +1
    flag('E' in core[6:8] or 'D' in core[7:9], 1)
    # Leader region contains GxxxxxT: +3
    flag(re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader), 3)
    # Core starts with G: +2
    flag(core.startswith("G"), 2)
    # Peptide and lasso cyclase are on same strand: +1
    flag(is_on_same_strand_as(cluster, query, 'PF00733'), 1)
    # Leader/core region length ratio between 0.5 and 2 (inclusive): +1
    flag(0.5 <= len(leader) / len(core) <= 2, 1)
    # Core starts with Cys and has an even number of Cys: 0
    flag(core.startswith("C") and cysteines % 2 == 0, 0)
    # Core contains no Gly: -4
    flag("G" not in core, -4)
    # Core has at least one aromatic residue: +1
    aromatics = sum(core.count(aa) for aa in "FWY")
    flag(aromatics >= 1, 1)
    # Core has at least 2 aromatic residues: +2
    flag(aromatics >= 2, 2)
    # Core has odd number of Cys: -2
    flag(cysteines % 2 != 0, -2)
    # Leader region contains Trp: -1
    flag("W" in leader, -1)
    # Leader region contains Lys: +1
    flag("K" in leader, 1)
    # Leader region has Cys: -2
    flag("C" in leader, -2)

    # The remaining heuristics change the score without adding a feature
    # Gene cluster does not contain PF13471: -2
    # (the distance is looked up once instead of once per comparison)
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2
    # Peptide utilizes alternate start codon: -1
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1
    return score, tabs
Пример #15
0
 def molecular_weight(self) -> float:
     """ Determines the weight of the core peptide.

         Returns:
             the weight calculated from a non-monoisotopic analysis
             of the core

         Raises:
             ValueError: if no core has been set
     """
     if self.core:
         core_analysis = utils.RobustProteinAnalysis(self.core, monoisotopic=False)
         return self._calculate_weight(core_analysis)
     raise ValueError("Cannot calculate weights without a core")