def test_with_secmet(self):
    """ Checks that a CDS carrying a SecMetQualifier survives conversion to
        a biopython feature and back with its domains intact.
    """
    expected = [
        SecMetQualifier.Domain("testA", 0.1, 1.1, 3, "test"),
        SecMetQualifier.Domain("testB", 5.1, 3.9, 5, "dummy"),
    ]
    self.cds.sec_met = SecMetQualifier(expected)

    bio = self.convert()
    # again, detecting leftover legacy versions
    assert "sec_met" not in bio.qualifiers
    serialised = bio.qualifiers["sec_met_domain"]
    assert len(serialised) == 2
    # each domain is stored in its string form, in the original order
    assert serialised == [str(domain) for domain in expected]

    regen = CDSFeature.from_biopython(bio)
    assert regen.sec_met
    rebuilt = regen.sec_met.domains
    assert len(rebuilt) == len(expected)
    assert rebuilt == expected
def annotate_orfs(cds_features: List[secmet.CDSFeature], hmm_results: Dict[str, List[HSP]]) -> None:
    """ Attaches sactipeptide domain information to newly found ORFs.

        Only relevant for CDS features that did not exist during the
        cluster detection stage of antiSMASH.

        Arguments:
            cds_features: the CDS features to annotate, a feature is only
                          modified if at least one hit is keyed by its name
            hmm_results: a dictionary mapping hit id to the list of HSPs
                         for that id, the ids are matched against CDS names

        Returns:
            None
    """
    hits_by_name: Dict[str, List[SecMetQualifier.Domain]] = defaultdict(list)
    # build a domain for every hit first, whether or not a CDS matches it
    for name, hsps in hmm_results.items():
        hits_by_name[name].extend(
            SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore, 0, "sactipeptides")
            for hsp in hsps
        )

    for cds in cds_features:
        found = hits_by_name.get(cds.get_name())
        # features without any hits keep whatever qualifier they already had
        if not found:
            continue
        cds.sec_met = SecMetQualifier(found)
def get_domains_for_cds(cds: CDSFeature) -> List[SecMetQualifier.Domain]:
    """ Builds one domain qualifier for each hit recorded against the given CDS.

        Reads results_by_id, num_seeds_per_hmm and tool from the enclosing
        scope.

        Arguments:
            cds: the CDS feature whose name is used to look up hits

        Returns:
            a list of domains, empty if the CDS name has no hits
    """
    return [
        SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                               num_seeds_per_hmm[hit.query_id], tool)
        for hit in results_by_id.get(cds.get_name(), [])
    ]
def detect_protoclusters_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                        rule_files: List[str], filter_file: str,
                                        tool: str) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Protocluster features based on those hits and the current protocluster
        detection rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rule_files: the files containing the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs
                  (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults built from a mapping of each protocluster to
            the results of the CDS features within it that have at least one
            domain, or from an empty mapping if the record yields no FASTA

        Raises:
            ValueError: if no rule files are given, or if a hit matches no
                        signature by either query id or accession
    """
    if not rule_files:
        raise ValueError("rules must be provided")

    fasta_text = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not fasta_text:
        return RuleDetectionResults({}, tool)

    sig_by_name = {profile.name: profile for profile in get_signature_profiles(signature_file)}

    # each rule file is parsed with the rules gathered from the earlier files
    rules = []  # type: List[rule_parser.DetectionRule]
    for path in rule_files:
        rules = create_rules(path, set(sig_by_name), rules)

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]
    for runresult in run_hmmsearch(seeds_file, fasta_text, use_tempfile=True):
        # only the part of the accession before the first '.' is used for lookups
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # the query id takes precedence, the accession is the fallback
            if hsp.query_id in sig_by_name:
                sig_key = hsp.query_id
            elif acc in sig_by_name:
                sig_key = acc
            else:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, acc))
            # Store result if it is above cut-off
            if hsp.bitscore > sig_by_name[sig_key].cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id,
                                            filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_protoclusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        per_cluster = []
        for cds in record.get_cds_features_within_location(cluster.location):
            domains = [
                SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                                       num_seeds_per_hmm[hit.query_id], tool)
                for hit in results_by_id.get(cds.get_name(), [])
            ]
            # CDS features without any surviving hits are left out entirely
            if not domains:
                continue
            per_cluster.append(CDSResults(cds, domains,
                                          cds_domains_by_cluster.get(cds.get_name(), {})))
        cds_results_by_cluster[cluster] = per_cluster

    return RuleDetectionResults(cds_results_by_cluster, tool)