def test_simple_after(self):
    """A CDS at 60000-63000 carrying 'right10k' is found at distance 10000.

    A profile name that no CDS carries must yield -1.
    """
    feature = self.create_cds(60000, 63000, profiles=["right10k"])
    self.record.add_cds_feature(feature)

    unknown = utils.distance_to_pfam(self.record, self.query, ["notright10k"])
    assert unknown == -1

    known = utils.distance_to_pfam(self.record, self.query, ["right10k"])
    assert known == 10000
def test_simple_before(self):
    """A CDS at 29000-30000 carrying 'left20k' is found at distance 20000.

    A profile name that no CDS carries must yield -1.
    """
    feature = self.create_cds(29000, 30000, profiles=["left20k"])
    self.record.add_cds_feature(feature)

    unknown = utils.distance_to_pfam(self.record, self.query, ["notleft20k"])
    assert unknown == -1

    known = utils.distance_to_pfam(self.record, self.query, ["left20k"])
    assert known == 20000
def test_edge_overlap_before(self):
    """A CDS ending at 10000 is not found; ending at 10001 it is, on either strand."""
    profile = ["l.edge"]
    feature = self.create_cds(9000, 10000, profiles=profile)
    self.record.add_cds_feature(feature)

    # ending exactly at 10000 is reported as "no hit"
    assert utils.distance_to_pfam(self.record, self.query, profile) == -1

    # extending the end by a single base gives a hit, whatever the strand
    for strand in (1, -1):
        feature.location = FeatureLocation(9000, 10001, strand=strand)
        assert utils.distance_to_pfam(self.record, self.query, profile) == 39999
def test_outside_before(self):
    """A CDS spanning 5000-9999 with the requested profile is not reported."""
    feature = self.create_cds(5000, 9999, profiles=["outside"])
    self.record.add_cds_feature(feature)

    result = utils.distance_to_pfam(self.record, self.query, ["outside"])
    assert result == -1
def test_self_hit(self):
    """Asking for the 'query_gene_prof' profile gives a distance of zero."""
    result = utils.distance_to_pfam(self.record, self.query, ["query_gene_prof"])
    assert result == 0
def test_empty_record(self):
    """With all CDS features removed and no profiles requested, the result is -1."""
    self.record._cds_features.clear()

    result = utils.distance_to_pfam(self.record, self.query, [])
    assert result == -1
def test_with_no_secmet(self):
    """A CDS with a fresh SecMetQualifier and no profiles never matches."""
    feature = self.create_cds(55000, 60000, profiles=[])
    feature.sec_met = SecMetQualifier()
    self.record.add_cds_feature(feature)

    result = utils.distance_to_pfam(self.record, self.query, ["test"])
    assert result == -1
def acquire_rodeo_heuristics(record: secmet.Record, query: secmet.CDSFeature,
                             leader: str, core: str,
                             domains: List[str]) -> Tuple[int, List[float]]:
    """ Calculate heuristic scores for RODEO

        Every heuristic appends exactly one value to the SVM feature list
        (a 0/1 flag, a position, a mass or a length), always in the same order,
        and most of them also adjust the integer score.

        Arguments:
            record: the record instance to analyse
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the domains found within CDS features of the cluster

        Returns:
            a tuple of the RODEO score, and a list of floats for use in the RODEO SVM

        Raises:
            ZeroDivisionError: if core is empty, as the leader/core length
                               ratio cannot be computed
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    # Leader peptide contains FxLD motif
    # (the match is kept, its position is reported as a later feature)
    fxld_match = re.search('F.LD', leader)
    if fxld_match:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Core residue position of Sx4C, Tx4C, Sx5C and Tx5C motifs (0 if absent)
    for pattern in ('S....C', 'T....C', 'S.....C', 'T.....C'):
        match = re.search(pattern, core)
        tabs.append(match.start() if match else 0)

    # Precursor is within 500 nt?
    hmmer_profiles = ['LANC_like', 'Lant_dehyd_C']
    distance = utils.distance_to_pfam(record, query, hmmer_profiles)
    # -1 is the "no such profile found" sentinel and must not count as close
    if 0 <= distance < 500:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Cluster contains LanB dehydratase domain (PF04738)
    # Cluster contains Lan C cyclase domain (PF05147)
    for domain in ("Lant_dehyd_C", "LANC_like"):
        if domain in domains:
            score += 2
            tabs.append(1)
        else:
            tabs.append(0)

    # Cluster LACKS LanB dehydratase domain (PF04738)
    # Cluster LACKS Lan C cyclase domain (PF05147)
    for domain in ("Lant_dehyd_C", "LANC_like"):
        if domain not in domains:
            score -= 2
            tabs.append(1)
        else:
            tabs.append(0)

    cluster_domain_scores = (
        ("PF14028", 2),        # LanB dehydratase elimination C-terminal domain
        ("Peptidase_S8", 1),   # S8 peptidase subtilase (PF00082)
        ("Peptidase_C39", 1),  # C39 peptidase (PF03412)
        ("PF00005", 1),        # ABC transporter
        ("YcaO", -4),          # YcaO-like protein (PF02624)
        ("ThiF", -4),          # ThiF-like protein (PF00899)
    )
    for domain, delta in cluster_domain_scores:
        if domain in domains:
            score += delta
            tabs.append(1)
        else:
            tabs.append(0)

    # Cluster contains PF02052 (Gallidermin), feature only, no score change
    # ("mature_ab" was previously misspelled "matura_ab" and so never matched)
    if set(domains).intersection({"Gallidermin", "mature_a", "mature_b", "mature_ab"}):
        tabs.append(1)
    else:
        tabs.append(0)

    # Cluster contains PF8130, feature only, no score change
    if "Antimicr18" in domains:
        tabs.append(1)
    else:
        tabs.append(0)

    # Precursor peptide mass < 4000 Da
    precursor_analysis = utils.RobustProteinAnalysis(precursor, monoisotopic=True,
                                                     ignore_invalid=True)
    if precursor_analysis.molecular_weight() < 4000:
        score -= 3
        tabs.append(1)
    else:
        tabs.append(0)

    # Core peptide mass < 2000 Da
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True,
                                                ignore_invalid=True)
    if core_analysis.molecular_weight() < 2000:
        score -= 3
        tabs.append(1)
    else:
        tabs.append(0)

    # Precursor peptide pHMMs below, one feature per entry and a single bonus
    # if any of them hit
    precursor_profiles = (
        {"TIGR03731", "Gallidermin"},     # gallidermin superfamily (cl03420)
        {"TIGR03731"},                    # lantibio_gallid (TIGR03731)
        {"TIGR04451", "strep_PEQAXS"},    # lanti_SCO0268 superfamily (cl22812)
        {"LD_lanti_pre"},                 # LD_lanti_pre (TIGR04363)
        {"Antimicr18"},                   # Antimicrobial18 (cl06940)
        {"Gallidermin", "mature_a", "mature_ab", "mature_b"},  # gallidermin (PF02052)
        {"Antimicr18"},                   # Antimicrobial18 (PF08130)
    )
    precursor_hit = False
    for profiles in precursor_profiles:
        if cds_has_domains(query, profiles):
            precursor_hit = True
            tabs.append(1)
        else:
            tabs.append(0)
    if precursor_hit:
        score += 3

    # Unmodified precursor, leader and core peptide masses
    # (recomputed with ignore_invalid=False, unlike the threshold checks above)
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))

    # Length of leader peptide
    tabs.append(len(leader))
    # Length of core peptide
    tabs.append(len(core))
    # Length of precursor peptide
    tabs.append(len(precursor))
    # Ratio of length of leader peptide / length of core peptide
    tabs.append(float(len(leader) / float(len(core))))

    # Core peptide ≥ 35 residues
    if len(core) >= 35:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Core peptide contains CC motif (not in last 3 residues)
    if 'CC' in core[:-3]:
        score -= 3
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader peptide has > 4 negatively charge motifs
    if sum(leader.count(aa) for aa in "DE") > 4:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader peptide has net negative charge
    charge_dict = {"E": -1, "D": -1, "K": 1, "R": 1}
    if sum(charge_dict[aa] for aa in leader if aa in charge_dict) < 0:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader residue position of FxLD motif (0 if absent)
    tabs.append(fxld_match.start() if fxld_match else 0)

    # Core peptide contains C-terminal CC (within last 3 residues)
    if 'CC' in core[-3:]:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Core peptide contains DGCGxTC / SFNS / SxxLC / CTxGC / TPGC / SFNSxC motifs
    motifs = (('DGCG.TC', 2), ('SFNS', 2), ('S..LC', 2), ('CT.GC', 1),
              ('TPGC', 1), ('SFNS.C', 1))
    for motif, motif_score in motifs:
        if re.search(motif, core):
            score += motif_score
            tabs.append(1)
        else:
            tabs.append(0)

    # Core peptide contains < 2 or < 3 Cys
    cys_count = core.count("C")
    if cys_count < 2:
        score -= 6
        tabs += [1, 1]
    elif cys_count < 3:
        score -= 3
        tabs += [1, 0]
    else:
        tabs += [0, 0]

    # No Cys/Ser/Thr in core peptide
    for amino, penalty in [("C", -10), ("S", -4), ("T", -4)]:
        if amino not in core:
            score += penalty
            tabs.append(1)
        else:
            tabs.append(0)

    # Lanthionine regex maximum ring number > 4
    numrings, profile = lanscout(core)
    if numrings > 4:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Lanthionine regex maximum ring number < 3
    if numrings < 3:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Lanthionine regex 4-membered ring/5-membered ring/6-membered ring/
    # 7-membered ring/8-membered ring
    # The first two profile entries only count when above 2, the remaining
    # entries count when non-zero.
    ring_scores = [2, 2, 2, 2, 1]
    for position, ringsize in enumerate(profile):
        if position < 2:
            counted = ringsize not in [0, 1, 2]
        else:
            counted = ringsize != 0
        if counted:
            score += ring_scores[position]
            tabs.append(1)
        else:
            tabs.append(0)

    return score, tabs
def acquire_rodeo_heuristics(
        cluster: secmet.Protocluster, query: secmet.CDSFeature, leader: str,
        core: str, domains: Dict[str, int]
) -> Tuple[int, List[float], List[int]]:
    """Calculate heuristic scores for RODEO

        Every heuristic appends its value(s) to the feature list in a fixed
        order and most of them also adjust the integer score.

        Arguments:
            cluster: the protocluster containing the query; its parent record
                     and CDS children are searched
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            domains: the domains found within the cluster, only key
                     membership is used here

        Returns:
            a tuple of the RODEO score, the list of feature values, and the
            hit end positions of every PF04055 HSP with a bitscore above 40

        Raises:
            ZeroDivisionError: if both leader and core are empty
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    # Calcd. precursor, leader and core peptide mass (Da), in that order
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True,
                                               ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    # computed once and reused by the three distance heuristics below
    hmmer_profiles = ['PF04055']
    distance = utils.distance_to_pfam(cluster.parent_record, query, hmmer_profiles)
    tabs.append(distance)

    # rSAM within 500 nt?
    # (-1 is the "not found" sentinel and must not count as close)
    if 0 <= distance < 500:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # rSAM within 150 nt?
    if 0 <= distance < 150:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # rSAM not found, or further than 10000 nt?
    # NOTE(review): the previous comment said 1000 nt while the code compares
    # against 10000, confirm which threshold is intended
    if distance == -1 or distance > 10000:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    first_cys = precursor.find("C")
    if first_cys == -1:  # no Cys at all
        score -= 2
        tabs += [0, 1]
    elif 0.25 <= first_cys / len(precursor) <= 0.60:
        score += 2
        tabs += [1, 0]
    else:
        score -= 2
        tabs += [0, 1]

    # Three or more Cys; Less than 3 Cys
    if precursor.count("C") >= 3:
        score += 4
        tabs += [1, 0]
    else:
        score -= 4
        tabs += [0, 1]

    # CxC/CxxC/CxxxC/CxxxxxC; # CC/CCC
    motifs = (('C.{5}C', 2), ('C.{3}C', 1), ('C.{2}C', 1), ('C.{1}C', 1),
              ('CC', -2), ('CCC', -2))
    for pattern, motif_score in motifs:
        if re.search(pattern, core):
            score += motif_score
            tabs.append(1)
        else:
            tabs.append(0)

    # No Cys in last 1/4th?
    # negative floor division, so the tail length is rounded up
    quarter_length = -len(precursor) // 4
    if "C" not in precursor[quarter_length:]:
        score += 1
        tabs.append(1)
    else:
        score -= 1
        tabs.append(0)

    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    if precursor[:two_thirds].count("C") == 2 and precursor[two_thirds:].count("C") == 1:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Peptide matches SboA hmm / SkfA hmm / SCIFF hmm
    peptide_profiles = (("Subtilosin_A", 3), ("TIGR04404", 3), ("TIGR03973", 2))
    for profile, delta in peptide_profiles:
        if cds_has_domains(query, {profile}):
            score += delta
            tabs.append(1)
        else:
            tabs.append(0)

    # cluster has PqqD/RRE (PF05402) / SPASM domain (PF13186)
    for domain in ("PF05402", "PF13186"):
        if domain in domains:
            score += 1
            tabs.append(1)
        else:
            tabs.append(0)

    # PF04055 (rSAM) domain start > 80
    runresults = subprocessing.run_hmmsearch(
        path.get_full_path(__file__, "data", "PF04055.hmm"),
        fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitends = []  # type: List[int]
    for runresult in runresults:
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            if hsp.bitscore > 40:
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    # max_start only leaves 0 when at least one HSP passed the cut-off
    if max_start > 80:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # cluster has peptidase
    peptidase_domains = ["Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16",
                         "Peptidase_S41"]
    no_peptidase = True
    for pepdom in peptidase_domains:
        if pepdom in domains:
            score += 1
            tabs.append(1)
            no_peptidase = False
        else:
            tabs.append(0)

    accessory_domains = (
        "PF00005",  # transporter
        "PF00664",  # transporter
        "PF00072",  # response regulator
        "PF07690",  # major facilitator
        "PF13304",  # ATPase
        "PF13353",  # Fer4_12
    )
    for domain in accessory_domains:
        if domain in domains:
            score += 1
            tabs.append(1)
        else:
            tabs.append(0)

    # cluster has rSAM (PF04055)
    if "PF04055" in domains or "TIGR03975" in domains:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # cluster has no recognized peptidase
    if no_peptidase:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    # NOTE(review): index is always negative, so last_motif_c is never
    # positive and the ratio below can never fall within 0.35-0.65; the
    # penalty branch is always taken. Confirm the intended calculation before
    # changing it, behaviour is deliberately left untouched here.
    last_motif_c = 0
    index = -1
    for aa in reversed(precursor):
        if aa == "C" and "C" in precursor[index - 6:index]:
            last_motif_c = index + 1
        index -= 1
    if 0.35 <= last_motif_c / len(precursor) <= 0.65:
        score += 3
        tabs += [0, 1]
    else:
        score -= 2
        tabs += [1, 0]

    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence
    # (lookahead so that overlapping candidates are all counted)
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    rex4 = re.compile(cysrex)
    if len(rex4.findall(core)) > 1:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    return score, tabs, hitends
def generate_rodeo_svm_csv(
        record: Record, query: CDSFeature, leader: str, core: str,
        previously_gathered_tabs: List[Union[float, int]],
        fimo_motifs: List[int],
        fimo_scores: Dict[int, float]) -> List[Union[float, int]]:
    """Generates all the items for a single precursor peptide candidate

        Arguments:
            record: the record containing the query
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core
            previously_gathered_tabs: feature values already computed, they
                                      are inserted after the two fixed
                                      leading columns
            fimo_motifs: the numbers of the motifs that were hit
            fimo_scores: scores by motif number, only read for motifs that
                         are in fimo_motifs

        Returns:
            the list of column values, in a fixed order

        Raises:
            ZeroDivisionError: if leader is empty
            IndexError: if core is empty (but leader is not)
    """
    amino_acids = "ARDNCQEGHILKMFPSTWYV"
    # aromatic, negative, positive, charged, aliphatic, hydroxyl
    residue_groups = ("FWY", "DE", "RK", "RKDE", "GAVLMI", "ST")
    charge_dict = {"E": -1, "D": -1, "K": 1, "H": 1, "R": 1}

    def net_charge(sequence: str) -> int:
        """Sum the charges of all residues with a known charge"""
        return sum(charge_dict[aa] for aa in sequence if aa in charge_dict)

    def composition(sequence: str) -> List[int]:
        """Per-residue counts followed by per-group counts"""
        counts = [sequence.count(aa) for aa in amino_acids]
        counts.extend(sum(sequence.count(aa) for aa in group)
                      for group in residue_groups)
        return counts

    precursor = leader + core
    columns = []  # type: List[Union[float, int]]
    # Precursor Index
    columns.append(1)
    # classification
    columns.append(0)
    columns += previously_gathered_tabs

    # Cluster has PF00733? / PF05402? / PF13471?
    # each distance is looked up once; -1 (not found) or anything beyond
    # 10000 counts as absent
    for profile in ('PF00733', 'PF05402', 'PF13471'):
        distance = utils.distance_to_pfam(record, query, [profile])
        if distance == -1 or distance > 10000:
            columns.append(0)
        else:
            columns.append(1)

    # Leader has LxxxxxT motif?
    if re.search('(L[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader):
        columns.append(1)
    else:
        columns.append(0)
    # Core has adjacent identical aas (doubles)?
    if any(first == second for first, second in zip(core, core[1:])):
        columns.append(1)
    else:
        columns.append(0)
    # Core length (aa)
    columns.append(len(core))
    # Leader length (aa)
    columns.append(len(leader))
    # Precursor length (aa)
    columns.append(len(leader) + len(core))
    # Leader/core ratio
    # NOTE(review): the value is core length divided by leader length, the
    # inverse of what the label says. Left as-is since the column has to line
    # up with whatever the consumer of these columns expects; confirm before
    # changing.
    columns.append(len(core) / len(leader))
    # Number of Pro in first 9 aa of core?
    columns.append(core[:9].count("P"))

    # Estimated core, leader and precursor charge,
    # followed by the absolute values in the same order
    charges = [net_charge(core), net_charge(leader), net_charge(precursor)]
    columns += charges
    columns += [abs(charge) for charge in charges]

    # Counts of AAs in leader, then aromatic / negative / positive / charged /
    # aliphatic / hydroxyl totals in leader
    columns += composition(leader)
    # The same for the core
    columns += composition(core)
    # Counts (0 or 1) of amino acids within first AA position of core sequence
    columns += [core[0].count(aa) for aa in amino_acids]
    # The same composition for leader+core
    # Temp to work with current training CSV
    columns += composition(precursor)

    # Motifs
    columns += [1 if motif in fimo_motifs else 0 for motif in range(1, 17)]
    # Total motifs hit
    columns.append(len(fimo_motifs))
    # Motif scores
    motif_scores = [fimo_scores[motif] if motif in fimo_motifs else 0
                    for motif in range(1, 17)]
    columns += motif_scores
    # Sum of MEME scores
    columns.append(sum(motif_scores))
    # No Motifs?
    if not fimo_motifs:
        columns.append(1)
    else:
        columns.append(0)
    # Alternate Start Codon?
    if not str(query.extract(record.seq)).startswith("ATG"):
        columns.append(1)
    else:
        columns.append(0)
    return columns
def acquire_rodeo_heuristics(record: Record, cluster: Cluster, query: CDSFeature,
                             leader: str, core: str
                             ) -> Tuple[int, List[Union[float, int]]]:
    """Calculate heuristic scores for RODEO

        Every heuristic up to and including "Leader region has Cys" appends
        one value to the feature list; the final two checks only adjust the
        score.

        Arguments:
            record: the record containing the query
            cluster: the cluster containing the query, used for the
                     same-strand check
            query: the feature being checked
            leader: the sequence of the peptide leader
            core: the sequence of the peptide core

        Returns:
            a tuple of the RODEO score and the list of feature values

        Raises:
            ZeroDivisionError: if core is empty
    """
    tabs = []  # type: List[Union[float, int]]
    score = 0
    core_cys = core.count("C")

    # Calcd. lasso peptide mass (Da) (with Xs average out)
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True,
                                                ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))

    # Distance to any biosynthetic protein (E, B, C)
    hmmer_profiles = ['PF13471', 'PF00733', 'PF05402']
    distance = utils.distance_to_pfam(record, query, hmmer_profiles)
    tabs.append(distance)

    # Within 500 nucleotides of any biosynthetic protein (E, B, C)	+1
    # (-1 is the "not found" sentinel and must not count as close)
    if 0 <= distance < 500:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Within 150 nucleotides of any biosynthetic protein (E, B, C)	+1
    if 0 <= distance < 150:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Greater than 1000 nucleotides from every biosynthetic protein (E, B, C)	-2
    # NOTE(review): a distance of -1 (nothing found) is not penalised here,
    # confirm whether it should be
    if distance > 1000:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Core region has 2 or 4 Cys residues	+1
    if core_cys in [2, 4]:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader region is longer than core region	+2
    if len(leader) > len(core):
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Core has 7 (Glu) or 8(Glu/Asp) or 9 (Asp) membered ring possible	+1
    if 'E' in core[6:8] or 'D' in core[7:9]:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader region contains GxxxxxT	+3
    if re.search('(G[ARNDBCEQZGHILKMFPSTWYV]{5}T)', leader):
        score += 3
        tabs.append(1)
    else:
        tabs.append(0)

    # Core starts with G	+2
    if core.startswith("G"):
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Peptide and lasso cyclase are on same strand	+1
    if is_on_same_strand_as(cluster, query, 'PF00733'):
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader/core region length ratio between 0.5 and 2 (inclusive)	+1
    if 0.5 <= len(leader) / len(core) <= 2:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Core starts with Cys and has an even number of Cys	0
    # (feature only, the score is unaffected)
    if core.startswith("C") and core_cys % 2 == 0:
        tabs.append(1)
    else:
        tabs.append(0)

    # Core contains no Gly	-4
    if "G" not in core:
        score -= 4
        tabs.append(1)
    else:
        tabs.append(0)

    # Core has at least one aromatic residue	+1
    if set("FWY") & set(core):
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Core has at least 2 aromatic residues	+2
    if sum(core.count(aa) for aa in "FWY") >= 2:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Core has odd number of Cys	-2
    if core_cys % 2 != 0:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader region contains Trp	-1
    if "W" in leader:
        score -= 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader region contains Lys	+1
    if "K" in leader:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)

    # Leader region has Cys	-2
    if "C" in leader:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)

    # The remaining two checks only affect the score, no feature is recorded

    # Gene cluster does not contain PF13471	-2
    pf13471_distance = utils.distance_to_pfam(record, query, ['PF13471'])
    if pf13471_distance == -1 or pf13471_distance > 10000:
        score -= 2

    # Peptide utilizes alternate start codon	-1
    if not str(query.extract(record.seq)).startswith("ATG"):
        score -= 1

    return score, tabs