def get_detected_domains(cluster: secmet.Protocluster) -> Dict[str, int]:
    """ Counts every domain id detected within a cluster.

        On top of the domains already annotated on the cluster's CDS features,
        the extra HMM profiles specific to sactipeptides are run and their
        hits are counted as well.

        Arguments:
            cluster: the Protocluster to gather domains from

        Returns:
            a dictionary mapping domain ids to number of times that domain was found
    """
    counts = {}  # type: Dict[str, int]

    def tally(identifier: str) -> None:
        """ Adds one occurrence of the given id to the running counts """
        counts[identifier] = counts.get(identifier, 0) + 1

    # domains already annotated on the CDS features (biosynthetic)
    for cds in cluster.cds_children:
        if cds.sec_met:
            for identifier in cds.sec_met.domain_ids:
                tally(identifier)

    # hits from the additional non-biosynthetic profiles, one count per HSP
    cluster_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    for hsps in run_non_biosynthetic_phmms(cluster_fasta).values():
        for hsp in hsps:
            tally(hsp.query_id)

    return counts
def get_alignments(self) -> List[Alignment]:
    """ Builds an Alignment for each hit in the results of running
        hmmpfam2 with this instance's database over its domains of interest.

        Returns:
            a list of Alignment instances, one per result that has any HSPs,
            or an empty list if there are no domains of interest
    """
    if not self.domains_of_interest:
        return []

    # long domain names are unsafe for the tools, so the fasta names each
    # domain by its index within self.domains_of_interest instead
    data = fasta.get_fasta_from_features(self.domains_of_interest, numeric_names=True)
    assert data, "empty fasta created"

    extra_args = [
        "-T", "0",  # min score
        "-E", "0.1",  # max evalue
    ]

    alignments = []
    for result in subprocessing.run_hmmpfam2(self.database, data, extra_args=extra_args):
        if not result.hsps:
            continue
        # only the first HSP of each result is used
        first_hsp = result.hsps[0]
        query_aln = first_hsp.aln[0]
        assert result.id == query_aln.id
        # map the numeric fasta name back to the real domain
        domain = self.domains_of_interest[int(result.id)]
        hit_aln = first_hsp.aln[1]
        alignments.append(Alignment(domain, query_aln.seq, hit_aln.seq,
                                    first_hsp.hit_start, first_hsp.hit_end))
    return alignments
def get_detected_domains(cluster: secmet.Protocluster) -> Set[str]:
    """ Gathers all detected domain ids from a cluster. Includes detection of
        some extra HMM profiles specific to thiopeptides.

        Arguments:
            cluster: the Protocluster to gather domains from

        Returns:
            a set of domain ids
    """
    # a set is built directly: the result is a set anyway, so intermediate
    # lists and linear "not in" scans only added quadratic work
    found_domains = set()  # type: Set[str]

    # Gather biosynthetic domains
    for feature in cluster.cds_children:
        if feature.sec_met:
            found_domains.update(feature.sec_met.domain_ids)

    # Gather non-biosynthetic domains
    cluster_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    non_biosynthetic_hmms_by_id = run_non_biosynthetic_phmms(cluster_fasta)
    for hsps_found_for_this_id in non_biosynthetic_hmms_by_id.values():
        found_domains.update(hsp.query_id for hsp in hsps_found_for_this_id)

    return found_domains
def generate_domains(record: Record) -> NRPSPKSDomains:
    """ Finds NRPS/PKS domains and motifs for every CDS within clusters and
        annotates them onto the record.

        Each CDS with at least one domain or motif gets a CDSResult, and each
        CDSResult is then asked to annotate its domains on the record.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            a NRPSPKSDomains instance containing all found motifs and domain HMMs
            for each CDS
    """
    results = NRPSPKSDomains(record.id)

    cds_within_clusters = record.get_cds_features_within_clusters()
    assert cds_within_clusters  # because every cluster should have genes

    protein_fasta = get_fasta_from_features(cds_within_clusters)
    domains_by_name = find_domains(protein_fasta, record)
    motifs_by_name = find_ab_motifs(protein_fasta)

    for cds in cds_within_clusters:
        name = cds.get_name()
        domains = domains_by_name.get(name, [])
        motifs = motifs_by_name.get(name, [])
        # a CDS with neither domains nor motifs gets no result at all
        if domains or motifs:
            hit_ids = [domain.hit_id for domain in domains]
            results.cds_results[cds] = CDSResult(domains, motifs, classify_cds(hit_ids))

    # annotation happens only once all results have been gathered
    for cds, cds_result in results.cds_results.items():
        cds_result.annotate_domains(record, cds)
    results.added = True
    return results
def run_hmmer(record: Record, features: Iterable[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str, filter_overlapping: bool = True
              ) -> HmmerResults:
    """ Build hmmer results for the given features

        hmmscan is run with the "--cut_tc" option and, optionally, overlapping
        hits on the same locus are filtered using the database's cutoffs.

        Arguments:
            record: the Record instance to run hmmer over
            features: the list of CDSFeatures to run over specifically
            max_evalue: a maximum evalue allowed for hits (exclusive)
            min_score: a minimum score allowed for hits (exclusive)
            database: the database to search for hits within
            tool: the name of the specific tool calling into this module
            filter_overlapping: whether to remove overlapping hits within each locus

        Returns:
            a HmmerResults instance holding the remaining hits

        Raises:
            ValueError: if the database path does not exist
    """
    if not os.path.exists(database):
        raise ValueError("Given database does not exist: %s" % database)

    query_sequence = fasta.get_fasta_from_features(features)
    hmmscan_results = subprocessing.run_hmmscan(database, query_sequence, opts=["--cut_tc"])
    hits = build_hits(record, hmmscan_results, min_score, max_evalue, database)

    if not filter_overlapping:
        return HmmerResults(record.id, max_evalue, min_score, database, tool, hits)

    # overlaps are only resolved between hits sharing a locus tag
    hits_by_locus = defaultdict(list)
    for hit in hits:
        hits_by_locus[hit.locus_tag].append(hit)

    cutoffs = pfamdb.get_pfam_cutoffs(database)
    filtered = [
        kept
        for locus_hits in hits_by_locus.values()
        for kept in remove_overlapping(locus_hits, cutoffs)
    ]
    return HmmerResults(record.id, max_evalue, min_score, database, tool, filtered)
def scan_for_functions(cds_features: List[CDSFeature], database: str,
                       hmmscan_opts: Optional[List[str]] = None) -> Dict[str, HMMResult]:
    """ Finds the best classification for each of the provided genes.

        Arguments:
            cds_features: a list of CDSFeatures to classify
            database: the path to the database to check
            hmmscan_opts: a list of extra options to provide to hmmscan

        Returns:
            a dictionary mapping CDS name to a single HMMResult, being the
            first of the refined hits for that CDS; CDSs without hits are absent
    """
    search_fasta = fasta.get_fasta_from_features(cds_features)
    raw_results = subprocessing.run_hmmscan(database, search_fasta, hmmscan_opts)
    refined = refine_hmmscan_results(raw_results, utils.get_hmm_lengths(database))

    best_hits = {}  # type: Dict[str, HMMResult]
    for feature in cds_features:
        name = feature.get_name()
        candidates = refined.get(name)
        if candidates:
            # only the leading hit of the refined list is kept
            best_hits[name] = candidates[0]
    return best_hits
def get_detected_domains(genes: List[CDSFeature]) -> List[str]:
    """ Gathers all detected domains in a cluster, including some not detected
        by hmm_detection.

        Biosynthetic domain ids are kept with duplicates, in gene order; the
        non-biosynthetic hits are appended afterwards, each only once.

        Arguments:
            genes: a list of genes to check

        Returns:
            a list of strings, each string being the name of a domain in the
            cluster
    """
    detected = []  # type: List[str]

    # domains already annotated on the genes
    for gene in genes:
        if gene.sec_met:
            detected.extend(gene.sec_met.domain_ids)

    # extra profiles, run over the same genes
    cluster_fasta = get_fasta_from_features(genes)
    assert cluster_fasta
    extra_hits = []  # type: List[str]
    for hsps in run_non_biosynthetic_phmms(cluster_fasta).values():
        for hsp in hsps:
            # NOTE(review): the hit item itself is stored, not an id attribute
            # of it - presumably the helper yields names here; confirm against
            # run_non_biosynthetic_phmms
            if hsp in extra_hits:
                continue
            extra_hits.append(hsp)

    detected.extend(extra_hits)
    return detected
def specific_analysis(record: secmet.Record, options: ConfigType) -> SactiResults:
    """ Analyse each sactipeptide cluster and find precursors within it.
        If an unannotated ORF would contain the precursor, it will be annotated.
        Each sactipeptide cluster is also passed on to RRE_main.

        Arguments:
            record: the Record to analyse
            options: the configuration object, handed through to RRE_main

        Returns:
            a SactiResults instance holding all found precursors and new ORFs
    """
    results = SactiResults(record.id)
    motif_count = 0
    new_feature_hits = 0

    sacti_clusters = (cluster for cluster in record.get_protoclusters()
                      if cluster.product == 'sactipeptide')
    # numbering starts at 1 and only counts sactipeptide clusters
    for counter, cluster in enumerate(sacti_clusters, 1):
        # unannotated ORFs are candidates too, once annotated with HMM hits
        new_orfs = all_orfs.find_all_orfs(record, cluster)
        orf_fasta = fasta.get_fasta_from_features(new_orfs)
        annotate_orfs(new_orfs, run_non_biosynthetic_phmms(orf_fasta))

        candidates = list(cluster.cds_children) + new_orfs
        domains = get_detected_domains(cluster)

        for candidate in candidates:
            motif = run_sactipred(cluster, candidate, domains)
            if motif is None:
                continue

            locus = candidate.get_name()
            results.motifs_by_locus[locus].append(motif)
            motif_count += 1
            results.clusters[cluster.get_protocluster_number()].add(locus)
            # a candidate without a region came from all_orfs, so is new
            if candidate.region is None:
                results.new_cds_features.add(candidate)
                new_feature_hits += 1

        # Analyze the cluster with RREfinder
        name = '%s_%s_%s' % (record.id, cluster.product, counter)
        RRE_main(cluster, results, name, options)

    if motif_count:
        verb = "is" if new_feature_hits == 1 else "are"
        logging.debug("Found %d sactipeptide motif(s) in %d feature(s), %d of which %s new",
                      motif_count, len(results.motifs_by_locus), new_feature_hits, verb)
    else:
        logging.debug("Found no sactipeptide motifs")
    return results
def run_hmmer(record: Record, features: List[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str) -> HmmerResults:
    """ Build hmmer results for the given features

        Arguments:
            record: the Record instance the features belong to
            features: the CDSFeatures to run hmmscan over
            max_evalue: the evalue threshold handed to build_hits
            min_score: the score threshold handed to build_hits
            database: the path of the database to search
            tool: the name of the tool, stored in the results

        Returns:
            a HmmerResults instance holding the hits built from the hmmscan output

        Raises:
            ValueError: if the database path does not exist
    """
    database_missing = not os.path.exists(database)
    if database_missing:
        raise ValueError("Given database does not exist: %s" % database)

    hits = build_hits(
        record,
        subprocessing.run_hmmscan(database, fasta.get_fasta_from_features(features)),
        min_score,
        max_evalue,
        database,
    )
    return HmmerResults(record.id, max_evalue, min_score, database, tool, hits)
def run_t2pks_hmmscan(cluster: Cluster) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan for type II PKS proteins on coding sequences in cluster

        The bundled "t2pks.hmm" profiles are used, with the "--cut_tc" option.

        Arguments:
            cluster: Cluster on which the type II PKS hmmscan shall be run

        Returns:
            a dictionary of key: cds and value: list of HMMResults, for hmmscan results of the cluster
    """
    query_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    profiles = path.get_full_path(__file__, "data", "t2pks.hmm")
    raw_hits = subprocessing.run_hmmscan(profiles, query_fasta, opts=['--cut_tc'])
    return refine_hmmscan_results(raw_hits, get_hmm_lengths(profiles))
def run_t2pks_hmmscan(cds_features: Iterable[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan for type II PKS proteins on the given CDSFeatures

        The bundled "t2pks.hmm" profiles are used, with the "--cut_tc" option.

        Arguments:
            cds_features: the CDSFeatures on which to run the type II PKS hmmscan

        Returns:
            a dictionary of key: cds and value: list of HMMResults, for hmmscan
            results of the given features
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    profiles = path.get_full_path(__file__, "data", "t2pks.hmm")
    raw_hits = subprocessing.run_hmmscan(profiles, query_fasta, opts=['--cut_tc'])
    profile_lengths = get_hmm_lengths(profiles)
    return refine_hmmscan_results(raw_hits, profile_lengths)
def generate_domains(record: Record) -> NRPSPKSDomains:
    """ Finds NRPS/PKS domains, KS subtypes, motifs and modules for every CDS
        within regions and annotates the domains onto the record.

        Modules of consecutive CDS features with results may be combined
        across the CDS boundary.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            a NRPSPKSDomains instance containing all found motifs and domain HMMs
            for each CDS
    """
    results = NRPSPKSDomains(record.id)

    cds_within_regions = record.get_cds_features_within_regions()
    assert cds_within_regions  # because every cluster should have genes

    protein_fasta = get_fasta_from_features(cds_within_regions)
    domains_by_name = find_domains(protein_fasta, record)
    ks_subtypes_by_name = find_ks_domains(protein_fasta)
    motifs_by_name = find_ab_motifs(protein_fasta)

    # the most recent CDS that had any domains or motifs
    prev: Optional[CDSModuleInfo] = None
    for cds in cds_within_regions:
        name = cds.get_name()
        domains = domains_by_name.get(name, [])
        motifs = motifs_by_name.get(name, [])
        if not domains and not motifs:
            continue

        subtype_names = match_subtypes_to_ks_domains(domains, ks_subtypes_by_name.get(name, []))
        hit_ids = [domain.hit_id for domain in domains]
        domain_type = classify_cds(hit_ids, subtype_names)
        modules = build_modules_for_cds(domains, subtype_names, name)
        results.cds_results[cds] = CDSResult(domains, motifs, domain_type, modules, subtype_names)

        # combine modules that cross CDS boundaries, if possible and relevant
        info = CDSModuleInfo(cds, modules)
        if prev and prev.modules and info.modules:
            # modifies the lists of modules linked in each CDSResult
            combine_modules(info, prev)
        prev = info

    # annotate only after any cross-CDS combining has been done
    for cds, cds_result in results.cds_results.items():
        cds_result.annotate_domains(record, cds)
    return results
def classify_genes(cds_features: List[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Finds possible classifications for the provided genes.

        hmmscan is run against the bundled "smcogs.hmm" profiles with the
        options "-E 1E-6" and the results are then refined.

        Arguments:
            cds_features: a list of CDSFeatures to classify

        Returns:
            a dictionary mapping CDS name to a list of HMMResult instances of
            classifications
    """
    hmm_file = path.get_full_path(__file__, "data", "smcogs.hmm")
    query_fasta = fasta.get_fasta_from_features(cds_features)
    raw_hits = subprocessing.run_hmmscan(hmm_file, query_fasta, ["-E", "1E-6"])
    return refine_hmmscan_results(raw_hits, utils.get_hmm_lengths(hmm_file))
def run_hmmer(record: Record, features: Iterable[CDSFeature], max_evalue: float,
              min_score: float, database: str, tool: str) -> HmmerResults:
    """ Build hmmer results for the given features

        hmmscan is run with the "--cut_tc" option.

        Arguments:
            record: the Record instance to run hmmer over
            features: the list of CDSFeatures to run over specifically
            max_evalue: a maximum evalue allowed for hits (exclusive)
            min_score: a minimum score allowed for hits (exclusive)
            database: the database to search for hits within
            tool: the name of the specific tool calling into this module

        Returns:
            a HmmerResults instance holding the hits found

        Raises:
            ValueError: if the database path does not exist
    """
    database_missing = not os.path.exists(database)
    if database_missing:
        raise ValueError("Given database does not exist: %s" % database)

    scan_output = subprocessing.run_hmmscan(
        database,
        fasta.get_fasta_from_features(features),
        opts=["--cut_tc"],
    )
    hits = build_hits(record, scan_output, min_score, max_evalue, database)
    return HmmerResults(record.id, max_evalue, min_score, database, tool, hits)
def run_starter_unit_blastp(cluster: Cluster, cds_hmm_hits: Dict[str, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on starter unit coding sequences in given cluster

        A CDS counts as a starter unit CDS when any of its HMM hits is one of
        'KSIII', 'AT', 'AMID' or 'LIG'; blastp is run once per such hit,
        against the bundled database named after the hit.

        Arguments:
            cluster: Cluster on which the blastp shall be run
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            an empty dictionary if no starter unit cds are present,
            otherwise a dictionary of key: cds and value: list of HMMresults,
            for blastp results of the cluster
    """
    # map each relevant CDS feature to the ids of its starter unit hits
    hit_ids_by_cds = {}
    for cds_name, hmm_hits in cds_hmm_hits.items():
        relevant_ids = [hit.hit_id for hit in hmm_hits
                        if hit.hit_id in ['KSIII', 'AT', 'AMID', 'LIG']]
        if relevant_ids:
            feature = cluster.parent_record.get_cds_by_name(cds_name)
            hit_ids_by_cds[feature] = relevant_ids

    if not hit_ids_by_cds:
        return {}

    blastp_results = []
    reference_fastas = set()
    for feature, relevant_ids in hit_ids_by_cds.items():
        query_sequence = fasta.get_fasta_from_features([feature])
        for hit_id in relevant_ids:
            blast_database = path.get_full_path(__file__, 'data', hit_id)
            blastp_results.extend(subprocessing.run_blastp(blast_database, query_sequence))
            reference_fastas.add(path.get_full_path(__file__, 'data', hit_id + '.fasta'))

    # reference lengths are only gathered for the databases actually used
    fasta_lengths = {}
    for fasta_file in reference_fastas:
        fasta_lengths.update(get_fasta_lengths(fasta_file))

    return refine_hmmscan_results(blastp_results, fasta_lengths)
def find_diamond_matches(record: Record, database: str) -> Tuple[HitsByCDS, HitsByReferenceName]:
    """ Runs diamond, comparing all features in the record to the given database

        The query fasta names each CDS by its index, and the same indexing is
        handed to the parser so hits can be tied back to the CDS features.

        Arguments:
            record: the record to use as a query
            database: the path of the database to compare to

        Returns:
            a tuple of
                a dictionary mapping CDSFeature to
                     a dictionary mapping reference CDS numeric ID to
                        a list of Hits for that reference
                a dictionary mapping reference region name to
                    a dictionary mapping reference CDS numeric ID to
                        a list of Hits for that reference
    """
    logging.info("Comparing regions to reference database")

    features = record.get_cds_features_within_regions()
    query_bytes = fasta.get_fasta_from_features(features, numeric_names=True).encode()

    extra_args = [
        "--compress", "0",
        "--max-target-seqs", "10000",
        "--evalue", "1e-05",
        "--outfmt", "6",  # 6 is blast tabular format, just as in blastp
    ]

    with NamedTemporaryFile() as query_file:
        query_file.write(query_bytes)
        # flushed so diamond sees the full query when reading the file by name
        query_file.flush()
        raw = subprocessing.run_diamond_search(query_file.name, database,
                                               mode="blastp", opts=extra_args)

    features_by_index = dict(enumerate(features))
    return blast_parse(raw, features_by_index)
def annotate_domains(record: Record) -> None:
    """ Annotates NRPS/PKS domains on CDS features. The `nrps_pks` member of
        each feature will be updated, along with creating CDSMotif features
        when relevant.

        A CDS without any domains is skipped entirely, motifs included.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            None
    """
    cds_within_clusters = record.get_cds_features_within_clusters()
    assert cds_within_clusters  # because every cluster should have genes

    protein_fasta = get_fasta_from_features(cds_within_clusters)
    domains_by_name = find_domains(protein_fasta, record)
    motifs_by_name = find_ab_motifs(protein_fasta)

    for cds in cds_within_clusters:
        name = cds.get_name()

        found_domains = domains_by_name.get(name)
        if not found_domains:
            continue

        # classify from the hit ids, then attach every domain
        cds.nrps_pks.type = classify_feature([domain.hit_id for domain in found_domains])
        for domain in found_domains:
            cds.nrps_pks.add_domain(domain)

        # motif features are added to both the record and the CDS
        found_motifs = motifs_by_name.get(name)
        if found_motifs:
            motif_features = generate_motif_features(record, cds, found_motifs)
            for motif_feature in motif_features:
                record.add_cds_motif(motif_feature)
            cds.motifs.extend(motif_features)
def run_starter_unit_blastp(cds_hmm_hits: Dict[CDSFeature, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on starter unit coding sequences in given cluster

        blastp is run for each HMM hit that is one of 'KSIII', 'AT', 'AMID'
        or 'LIG', against the bundled database named after the hit. Resulting
        hit ids that lack a "-CoA" suffix have it appended.

        Arguments:
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMresults,
            empty if blastp found nothing
    """
    blastp_results = []
    reference_fastas = set()
    for cds, hmm_hits in cds_hmm_hits.items():
        query_sequence = fasta.get_fasta_from_features([cds])
        for hmm_hit in hmm_hits:
            if hmm_hit.hit_id in ['KSIII', 'AT', 'AMID', 'LIG']:
                blast_database = path.get_full_path(__file__, 'data', hmm_hit.hit_id)
                blastp_results.extend(subprocessing.run_blastp(blast_database, query_sequence))
                reference_fastas.add(path.get_full_path(__file__, 'data', hmm_hit.hit_id + '.fasta'))

    if not blastp_results:
        return {}

    # reference lengths are only gathered for the databases actually used
    fasta_lengths = {}
    for fasta_file in reference_fastas:
        fasta_lengths.update(get_fasta_lengths(fasta_file))

    results = refine_hmmscan_results(blastp_results, fasta_lengths)

    # rename in place, leaving hits that already carry the suffix untouched
    for cds_hits in results.values():
        cds_hits[:] = [
            existing if existing.hit_id.endswith("-CoA")
            else HMMResult(existing.hit_id + "-CoA", existing.query_start, existing.query_end,
                           existing.evalue, existing.bitscore)
            for existing in cds_hits
        ]
    return results
def acquire_rodeo_heuristics(cluster: secmet.Protocluster, query: secmet.CDSFeature,
                             leader: str, core: str,
                             domains: Dict[str, int]) -> Tuple[int, List[float], List[int]]:
    """ Calculate heuristic scores for RODEO

        Each heuristic adjusts the running score and appends one or more
        entries to the feature list, so the order of the checks below fixes
        the layout of that list.

        Arguments:
            cluster: the Protocluster containing the query; its parent record
                     and CDS children are used for the rSAM (PF04055) checks
            query: the CDSFeature of the candidate precursor
            leader: the leader portion of the precursor sequence
            core: the core portion of the precursor sequence
            domains: the domain ids detected in the cluster; only membership
                     is tested here, the counts are not used

        Returns:
            a tuple of
                the heuristic score,
                the list of feature values, in the order of the checks, and
                the hit ends of all PF04055 HSPs with a bitscore over 40
    """
    # NOTE(review): an empty leader + core would divide by zero at the
    # C-terminal ratio check below (if the earlier calls accept it) - confirm
    # callers never pass one
    tabs = []
    score = 0
    precursor = leader + core
    # Calcd. precursor peptide mass (Da)
    precursor_analysis = utils.RobustProteinAnalysis(precursor, monoisotopic=True, ignore_invalid=False)
    tabs.append(float(precursor_analysis.molecular_weight()))
    # Calcd. leader peptide mass (Da)
    leader_analysis = utils.RobustProteinAnalysis(leader, monoisotopic=True, ignore_invalid=False)
    tabs.append(float(leader_analysis.molecular_weight()))
    # Calcd. core peptide mass (Da)
    core_analysis = utils.RobustProteinAnalysis(core, monoisotopic=True, ignore_invalid=False)
    tabs.append(float(core_analysis.molecular_weight()))
    # Distance to any biosynthetic protein (E, B, C)
    # (only the rSAM profile PF04055 is actually searched for here)
    hmmer_profiles = ['PF04055']
    distance = utils.distance_to_pfam(cluster.parent_record, query, hmmer_profiles)
    tabs.append(distance)
    # rSAM within 500 nt?
    # (the same distance lookup as above is repeated for each check)
    if utils.distance_to_pfam(cluster.parent_record, query, ['PF04055']) < 500:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # rSAM within 150 nt?
    if utils.distance_to_pfam(cluster.parent_record, query, ['PF04055']) < 150:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # rSAM further than 1000 nt?
    # NOTE(review): the threshold in the code is 10000, not 1000 - confirm
    # which is intended; presumably -1 means no PF04055 hit was found, in which
    # case the "within" checks above would also pass - verify distance_to_pfam
    if utils.distance_to_pfam(cluster.parent_record, query, ['PF04055']) == -1 or \
       utils.distance_to_pfam(cluster.parent_record, query, ['PF04055']) > 10000:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)
    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    # (two fields: in-range flag, then out-of-range flag; both bounds are inclusive)
    if "C" not in precursor:
        score -= 2
        tabs += [0, 1]
    elif 0.25 <= precursor.find("C") / len(precursor) <= 0.60:
        score += 2
        tabs += [1, 0]
    else:
        score -= 2
        tabs += [0, 1]
    # Three or more Cys; Less than 3 Cys
    if precursor.count("C") >= 3:
        score += 4
        tabs += [1, 0]
    else:
        score -= 4
        tabs += [0, 1]
    # CxC/CxxC/CxxxC/CxxxxxC; # CC/CCC
    # each entry is (regex, score change if found in the core), one field each
    motifs = (('C.{5}C', 2), ('C.{3}C', 1), ('C.{2}C', 1), ('C.{1}C', 1), ('CC', -2), ('CCC', -2))
    for motif in motifs:
        if re.search(motif[0], core):
            score += motif[1]
            tabs.append(1)
        else:
            tabs.append(0)
    # No Cys in last 1/4th?
    # negative, so it slices from the end of the precursor
    quarter_length = -len(precursor) // 4
    if "C" not in precursor[quarter_length:]:
        score += 1
        tabs.append(1)
    else:
        score -= 1
        tabs.append(0)
    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    if precursor[:two_thirds].count("C") == 2 and precursor[two_thirds:].count("C") == 1:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # Peptide matches SboA hmm
    if cds_has_domains(query, {"Subtilosin_A"}):
        score += 3
        tabs.append(1)
    else:
        tabs.append(0)
    # Peptide matches SkfA hmm
    if cds_has_domains(query, {"TIGR04404"}):
        score += 3
        tabs.append(1)
    else:
        tabs.append(0)
    # Peptide matches SCIFF hmm
    if cds_has_domains(query, {"TIGR03973"}):
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has PqqD/RRE (PF05402)
    if "PF05402" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has SPASM domain (PF13186)
    if "PF13186" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # PF04055 (rSAM) domain start > 80
    # hmmsearch is run over every CDS in the cluster, not just the query
    runresults = subprocessing.run_hmmsearch(path.get_full_path(__file__, "data", "PF04055.hmm"),
                                             fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitstarts = []
    hitends = []
    for runresult in runresults:
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            if hsp.bitscore > 40:
                hitstarts.append(hsp.hit_start)
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    if hitstarts and max_start > 80:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has peptidase
    # one field per peptidase domain, each present domain adds to the score
    peptidase_domains = ["Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16", "Peptidase_S41"]
    no_peptidase = True
    for pepdom in peptidase_domains:
        if pepdom in domains:
            score += 1
            tabs.append(1)
            no_peptidase = False
        else:
            tabs.append(0)
    # cluster has transporter
    # one field per transporter domain
    transport_domains = ["PF00005", "PF00664"]
    for transpdom in transport_domains:
        if transpdom in domains:
            score += 1
            tabs.append(1)
        else:
            tabs.append(0)
    # cluster has response regulator (PF00072)
    if "PF00072" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has major facilitator (PF07690)
    if "PF07690" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has ATPase (PF13304)
    if "PF13304" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has Fer4_12 (PF13353)
    if "PF13353" in domains:
        score += 1
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has rSAM (PF04055)
    # (TIGR03975 is accepted as well)
    if "PF04055" in domains or "TIGR03975" in domains:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)
    # cluster has no recognized peptidase
    if no_peptidase:
        score -= 2
        tabs.append(1)
    else:
        tabs.append(0)
    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    # NOTE(review): index counts down from -1, so last_motif_c is never
    # positive and the ratio below can never reach 0.35; as written the
    # "score += 3" branch is unreachable and every precursor takes the
    # "score -= 2" branch - confirm whether a positive offset was intended
    last_motif_c = 0
    index = -1
    for aa in reversed(precursor):
        if aa == "C" and "C" in precursor[index - 6:index]:
            last_motif_c = index + 1
        index -= 1
    if 0.35 <= last_motif_c / len(precursor) <= 0.65:
        score += 3
        tabs += [0, 1]
    else:
        score -= 2
        tabs += [1, 0]
    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence
    # the lookahead lets overlapping Cx..C structures each be counted
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    rex4 = re.compile(cysrex)
    if len(rex4.findall(core)) > 1:
        score += 2
        tabs.append(1)
    else:
        tabs.append(0)
    return score, tabs, hitends