def run_non_biosynthetic_phmms(fasta: str) -> Dict[str, List[str]]:
    """ Finds lanthipeptide-specific domains in the input fasta

        Every profile listed in the bundled "hmmdetails.txt" is searched
        against the given sequences and only hits scoring above the
        profile's cutoff are reported.

        Arguments:
            fasta: a string containing gene sequences in fasta format

        Returns:
            a dictionary mapping the hit id to a list of matching query ids
    """
    details_file = path.get_full_path(__file__, "data", "non_biosyn_hmms", "hmmdetails.txt")
    rows = []
    with open(details_file, "r") as handle:
        for line in handle:
            # only rows with exactly three tabs (four columns) describe a profile
            if line.count("\t") == 3:
                rows.append(line.strip().split("\t"))

    profiles = []
    for row in rows:
        # columns: name, description, cutoff, profile path
        profiles.append(HmmSignature(row[0], row[1], int(row[2]), row[3]))

    hits_by_id = defaultdict(list)  # type: Dict[str, List[str]]
    for profile in profiles:
        # only the file name of the listed path is used, the profile itself
        # is always loaded from the bundled data directory
        hmm_filename = profile.path.rpartition(os.sep)[2]
        profile.path = path.get_full_path(__file__, "data", "non_biosyn_hmms", hmm_filename)
        for search_result in subprocessing.run_hmmsearch(profile.path, fasta):
            for hsp in search_result.hsps:
                # keep the hit only if it is above the profile's cutoff
                if hsp.bitscore > profile.cutoff:
                    hits_by_id[hsp.hit_id].append(hsp.query_id)
    return hits_by_id
def run_non_biosynthetic_phmms(cluster_fasta: str) -> Dict[str, List[HSP]]:
    """ Searches the given sequences with each of the bundled non-biosynthetic
        pHMMs and gathers the hits scoring above each profile's cutoff.

        Arguments:
            cluster_fasta: a string containing gene sequences in fasta format,
                           if empty no search is run

        Returns:
            a dictionary mapping the hit id to a list of the HSPs hitting it,
            empty if no fasta was provided
    """
    if not cluster_fasta:
        return {}

    details_file = path.get_full_path(__file__, "data", "non_biosyn_hmms", "hmmdetails.txt")
    with open(details_file, "r") as handle:
        lines = handle.read().splitlines()

    profiles = []
    for line in lines:
        # only rows with exactly three tabs (four columns) describe a profile
        if line.count("\t") != 3:
            continue
        row = line.split("\t")
        # columns: name, description, cutoff, profile path
        profiles.append(HmmSignature(row[0], row[1], int(row[2]), row[3]))

    hits_by_id = defaultdict(list)  # type: Dict[str, List[HSP]]
    for profile in profiles:
        # the listed path is relative to the bundled data directory
        profile.path = path.get_full_path(__file__, "data", "non_biosyn_hmms", profile.path)
        for search_result in subprocessing.run_hmmsearch(profile.path, cluster_fasta):
            for hsp in search_result.hsps:
                # keep the hit only if it is above the profile's cutoff
                if hsp.bitscore > profile.cutoff:
                    hits_by_id[hsp.hit_id].append(hsp)
    return hits_by_id
def acquire_rodeo_heuristics(cluster: secmet.Protocluster, query: secmet.CDSFeature,
                             leader: str, core: str,
                             domains: Dict[str, int]) -> Tuple[int, List[float], List[int]]:
    """ Calculate heuristic scores for RODEO

        Arguments:
            cluster: the protocluster containing the query, its parent record
                     is used for the rSAM distance and its CDS children are
                     searched with the PF04055 profile
            query: the CDS feature of the candidate precursor
            leader: the leader portion of the precursor sequence
            core: the core portion of the precursor sequence
            domains: the domains present in the cluster, only membership
                     of the domain names is checked

        Returns:
            a tuple of
                the total heuristic score,
                the individual feature values in a fixed order (three masses,
                    the rSAM distance, then one binary column per heuristic),
                the hit end positions of the PF04055 hits over 40 bits
    """
    tabs = []  # type: List[float]
    score = 0
    precursor = leader + core

    def add_flag(present: Any, when_set: int = 0, when_unset: int = 0) -> None:
        """ Appends a single binary feature column and adjusts the score """
        nonlocal score
        if present:
            score += when_set
            tabs.append(1)
        else:
            score += when_unset
            tabs.append(0)

    # Calcd. precursor, leader and core peptide masses (Da), in that order
    for sequence in (precursor, leader, core):
        analysis = utils.RobustProteinAnalysis(sequence, monoisotopic=True, ignore_invalid=False)
        tabs.append(float(analysis.molecular_weight()))

    # Distance to the rSAM (PF04055), calculated once and reused below
    distance = utils.distance_to_pfam(cluster.parent_record, query, ['PF04055'])
    tabs.append(distance)
    # NOTE(review): a distance of -1 is treated below as "not found", but it
    # also satisfies the following two checks - confirm that is intended
    # rSAM within 500 nt?
    add_flag(distance < 500, when_set=1)
    # rSAM within 150 nt?
    add_flag(distance < 150, when_set=1)
    # no rSAM found or rSAM further than 10000 nt?
    # NOTE(review): the previous comment said 1000 nt while the code has always
    # used 10000, the threshold is left untouched
    add_flag(distance == -1 or distance > 10000, when_set=-2)

    # Ratio of N-term to 1st Cys 0.25<x<0.60; Ratio of N-term to 1st Cys <0.25 or >0.60
    first_cys = precursor.find("C")
    if first_cys != -1 and 0.25 <= first_cys / len(precursor) <= 0.60:
        score += 2
        tabs += [1, 0]
    else:
        score -= 2
        tabs += [0, 1]

    # Three or more Cys; Less than 3 Cys
    if precursor.count("C") >= 3:
        score += 4
        tabs += [1, 0]
    else:
        score -= 4
        tabs += [0, 1]

    # CxC/CxxC/CxxxC/CxxxxxC; # CC/CCC
    motifs = (('C.{5}C', 2), ('C.{3}C', 1), ('C.{2}C', 1), ('C.{1}C', 1), ('CC', -2), ('CCC', -2))
    for pattern, value in motifs:
        add_flag(re.search(pattern, core), when_set=value)

    # No Cys in last 1/4th?
    quarter_length = -len(precursor) // 4
    add_flag("C" not in precursor[quarter_length:], when_set=1, when_unset=-1)

    # 2 Cys in first 2/3rds of precursor, 1 Cys in last 1/3rd of precursor
    two_thirds = 2 * len(precursor) // 3
    add_flag(precursor[:two_thirds].count("C") == 2 and precursor[two_thirds:].count("C") == 1,
             when_set=1)

    # Peptide matches SboA hmm / SkfA hmm / SCIFF hmm
    for profile_name, value in (("Subtilosin_A", 3), ("TIGR04404", 3), ("TIGR03973", 2)):
        add_flag(cds_has_domains(query, {profile_name}), when_set=value)

    # cluster has PqqD/RRE (PF05402)
    add_flag("PF05402" in domains, when_set=1)
    # cluster has SPASM domain (PF13186)
    add_flag("PF13186" in domains, when_set=1)

    # PF04055 (rSAM) domain start > 80
    runresults = subprocessing.run_hmmsearch(path.get_full_path(__file__, "data", "PF04055.hmm"),
                                             fasta.get_fasta_from_features(cluster.cds_children))
    max_start = 0
    hitends = []  # type: List[int]
    for runresult in runresults:
        for hsp in runresult.hsps:
            # only consider hits above the cut-off
            if hsp.bitscore > 40:
                max_start = max(hsp.hit_start, max_start)
                hitends.append(hsp.hit_end)
    # max_start only changes when a hit was kept, so no separate hit check is needed
    add_flag(max_start > 80, when_set=1)

    # cluster has peptidase
    peptidase_domains = ["Peptidase_M16_C", "Peptidase_S8", "Peptidase_M16", "Peptidase_S41"]
    no_peptidase = True
    for pepdom in peptidase_domains:
        found = pepdom in domains
        add_flag(found, when_set=1)
        if found:
            no_peptidase = False

    # cluster has transporter
    for transpdom in ("PF00005", "PF00664"):
        add_flag(transpdom in domains, when_set=1)

    # cluster has response regulator (PF00072)
    add_flag("PF00072" in domains, when_set=1)
    # cluster has major facilitator (PF07690)
    add_flag("PF07690" in domains, when_set=1)
    # cluster has ATPase (PF13304)
    add_flag("PF13304" in domains, when_set=1)
    # cluster has Fer4_12 (PF13353)
    add_flag("PF13353" in domains, when_set=1)
    # cluster has rSAM (PF04055)
    add_flag("PF04055" in domains or "TIGR03975" in domains, when_set=2)
    # cluster has no recognized peptidase
    add_flag(no_peptidase, when_set=-2)

    # C-terminal portion is < 0.35 or > 0.65; C-terminal portion is defined as
    # the part from the last cysteine in the last identified Cx(n)C motif to the C-terminus
    # the binary opposite is also included as the next field
    last_motif_c = 0
    index = -1
    for aa in reversed(precursor):
        if aa == "C" and "C" in precursor[index - 6:index]:
            # index is negative (relative to the C-terminus), so it has to be
            # converted to a 1-based position from the N-terminus; using the
            # raw negative index made the ratio below always <= 0 and the
            # first branch unreachable
            last_motif_c = len(precursor) + index + 1
        index -= 1
    if 0.35 <= last_motif_c / len(precursor) <= 0.65:
        score += 3
        tabs += [0, 1]
    else:
        score -= 2
        tabs += [1, 0]

    # SS profile count > 1
    # is there more than one Cx..C structure in the sequence
    cysrex = '(?=(C.{%d,%d}C))' % (CHAIN_LOWER, CHAIN_UPPER)
    rex4 = re.compile(cysrex)
    add_flag(len(rex4.findall(core)) > 1, when_set=2)

    return score, tabs, hitends
def detect_protoclusters_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                        rule_files: List[str], valid_categories: Set[str],
                                        filter_file: str, tool: str,
                                        annotate_existing_subregions: bool = True
                                        ) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Protocluster features based on those hits and the current protocluster
        detection rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                            with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            valid_categories: a set containing valid rule category strings
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. rule_based_clusters)
            annotate_existing_subregions: if True, subregions already present in the record
                                          will have domains annotated even if no protocluster
                                          is found

        Returns:
            a RuleDetectionResults instance holding the per-protocluster CDS results
            and the CDS results for annotated CDSes outside of protoclusters

        Raises:
            ValueError: if no rule files are provided or a hit has no matching signature
    """
    if not rule_files:
        raise ValueError("rules must be provided")

    full_fasta = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not full_fasta:
        return RuleDetectionResults({}, tool, [])

    sig_by_name = {sig.name: sig for sig in get_signature_profiles(signature_file)}

    # each rule file extends the rules (and aliases) built from the previous files
    rules: List[rule_parser.DetectionRule] = []
    aliases: Dict[str, List[rule_parser.Token]] = {}
    for rule_file in rule_files:
        rules = create_rules(rule_file, set(sig_by_name), valid_categories, aliases, rules)

    results = []
    results_by_id: Dict[str, List[HSP]] = {}
    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # the signature is found by query id first, falling back to the accession
            if hsp.query_id in sig_by_name:
                sig_name = hsp.query_id
            elif acc in sig_by_name:
                sig_name = acc
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, acc))
            # store the hit only if it is above the signature's cut-off
            if hsp.bitscore > sig_by_name[sig_name].cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))
    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # annotate everything in detected protoclusters
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
        """ Builds a domain qualifier for every remaining hit on the given CDS """
        return [
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                   num_seeds_per_hmm[hsp.query_id], tool)
            for hsp in results_by_id.get(cds.get_name(), [])
        ]

    cds_results_by_cluster = {}
    cdses_with_annotations = set()
    for cluster in clusters:
        within_cluster = []
        for cds in record.get_cds_features_within_location(cluster.location):
            domains = get_domains_for_cds(cds)
            if not domains:
                continue
            definition_domains = cds_domains_by_cluster.get(cds.get_name(), {})
            within_cluster.append(CDSResults(cds, domains, definition_domains))
            cdses_with_annotations.add(cds)
        cds_results_by_cluster[cluster] = within_cluster

    # add detected profile annotations for any existing subregions, if enabled
    cds_results_outside_clusters = []
    if annotate_existing_subregions:
        for subregion in record.get_subregions():
            for cds in subregion.cds_children:
                # anything already annotated must not be reported twice
                if cds in cdses_with_annotations:
                    continue
                domains = get_domains_for_cds(cds)
                if not domains:
                    continue
                cds_results_outside_clusters.append(CDSResults(cds, domains, {}))
                cdses_with_annotations.add(cds)

    return RuleDetectionResults(cds_results_by_cluster, tool, cds_results_outside_clusters)
def detect_borders_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                  rules_file: str, filter_file: str,
                                  tool: str) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Cluster features based on those hits and the current cluster detection
        rules. Each detected cluster is added to the record as a cluster border.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                            with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rules_file: the file containing all the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs
                  (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance holding the per-cluster CDS results,
            or None if the record provided no sequences to search

        Raises:
            ValueError: if a hit has no matching signature
    """
    full_fasta = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not full_fasta:
        return None

    sig_by_name = {sig.name: sig for sig in get_signature_profiles(signature_file)}
    rules = create_rules(rules_file, set(sig_by_name))

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]
    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # the signature is found by query id first, falling back to the accession
            if hsp.query_id in sig_by_name:
                sig_name = hsp.query_id
            elif acc in sig_by_name:
                sig_name = acc
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, acc))
            # store the hit only if it is above the signature's cut-off
            if hsp.bitscore > sig_by_name[sig_name].cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))
    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_clusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        record.add_cluster_border(cluster)
        # CDS features are gathered from the cluster widened by its extent on both sides
        cluster_extent = FeatureLocation(cluster.location.start - cluster.extent,
                                         cluster.location.end + cluster.extent)
        within_extent = []
        for cds in record.get_cds_features_within_location(cluster_extent):
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds.get_name(), [])
            ]
            if not domains:
                continue
            definition_domains = cds_domains_by_cluster.get(cds.get_name(), {})
            within_extent.append(CDSResults(cds, domains, definition_domains))
        cds_results_by_cluster[cluster] = within_extent

    return RuleDetectionResults(cds_results_by_cluster, tool)