def find_ks_domains(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of KS domain results for that feature
    """
    # --cut_tc: let hmmscan apply each profile's own trusted cutoffs
    opts = ["--cut_tc"]
    ks_file = path.get_full_path(__file__, "data", "ksdomains.hmm")
    lengths = utils.get_hmm_lengths(ks_file)
    domains = subprocessing.run_hmmscan(ks_file, fasta, opts)
    # The refined mapping is the documented return value. Previously it was
    # discarded and the function always raised NotImplementedError, so it could
    # never be used. Returning the refined results matches the declared return
    # type and the sibling domain/motif finders in this module.
    return refine_hmmscan_results(domains, lengths, neighbour_mode=True)
def scan_for_functions(cds_features: List[CDSFeature], database: str,
                       hmmscan_opts: Optional[List[str]] = None) -> Dict[str, HMMResult]:
    """ Picks a single classification for each of the provided genes.

        Arguments:
            cds_features: a list of CDSFeatures to classify
            database: the path to the database to check
            hmmscan_opts: a list of extra options to provide to hmmscan

        Returns:
            a dictionary mapping CDS name to one HMMResult: the first entry of
            that CDS's refined hit list. CDSs without any hits are left out.
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    raw_results = subprocessing.run_hmmscan(database, query_fasta, hmmscan_opts)
    profile_lengths = utils.get_hmm_lengths(database)
    refined = refine_hmmscan_results(raw_results, profile_lengths)

    top_hits = {}  # type: Dict[str, HMMResult]
    for name in (cds.get_name() for cds in cds_features):
        cds_hits = refined.get(name)
        # NOTE(review): taking index 0 presumes the refined hits are ordered
        # with the preferred hit first - confirm against refine_hmmscan_results
        if cds_hits:
            top_hits[name] = cds_hits[0]
    return top_hits
def test_combined(self):
    """ Refining the fixture results with default settings should leave one
        gene with exactly one hit, with that hit's details intact.
    """
    refined = refinement.refine_hmmscan_results(self.results, self.hmm_lengths)
    assert len(refined) == 1

    gene_hits = refined[self.gene_id]
    assert len(gene_hits) == 1

    hit = gene_hits[0]
    assert hit.hit_id == "SMCOG1048:sensor_histidine_kinase"
    assert hit.evalue == 3.6e-13
    assert hit.bitscore == 43.5
    assert hit.query_start == 91
    assert hit.query_end == 390
def find_ab_motifs(fasta: str) -> Dict[str, List[HMMResult]]:
    """ Scans the given sequences against the abMotifs profiles.

        Arguments:
            fasta: a group of features in fasta format

        Returns:
            a dictionary mapping feature name to a list of motif results for that feature
    """
    profile_path = path.get_full_path(__file__, "data", "abmotifs.hmm")
    # hmmscan is run with "-E 0.25" rather than the profiles' own cutoffs
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["-E", "0.25"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_hits, profile_lengths, neighbour_mode=True)
def run_t2pks_hmmscan(cluster: Cluster) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan with the type II PKS profiles over the coding sequences
        of a cluster.

        Arguments:
            cluster: the Cluster whose CDS children will be scanned

        Returns:
            a dictionary mapping CDS name to the list of refined HMMResults
            found for that CDS
    """
    query_fasta = fasta.get_fasta_from_features(cluster.cds_children)
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, opts=['--cut_tc'])
    # profile lengths are only read once the scan itself has completed
    return refine_hmmscan_results(raw_hits, get_hmm_lengths(profile_path))
def find_domains(fasta: str, record: Record) -> Dict[str, List[HMMResult]]:
    """ Scans for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains

        Arguments:
            fasta: a group of features in fasta format
            record: the Record that contains all the features

        Returns:
            a dictionary mapping feature name to a list of domain results for
            that feature, after non-terminal docking domains have been filtered
    """
    profile_path = path.get_full_path(__file__, "data", "nrpspksdomains.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, fasta, ["--cut_tc"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    refined = refine_hmmscan_results(raw_hits, profile_lengths, neighbour_mode=True)
    return filter_nonterminal_docking_domains(record, refined)
def run_t2pks_hmmscan(cds_features: Iterable[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Runs hmmscan with the type II PKS profiles over the given CDSFeatures.

        Arguments:
            cds_features: the CDSFeatures to scan

        Returns:
            a dictionary mapping CDS name to the list of refined HMMResults
            found for that CDS
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    profile_path = path.get_full_path(__file__, "data", "t2pks.hmm")
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, opts=['--cut_tc'])
    # profile lengths are only read once the scan itself has completed
    return refine_hmmscan_results(raw_hits, get_hmm_lengths(profile_path))
def classify_genes(cds_features: List[CDSFeature]) -> Dict[str, List[HMMResult]]:
    """ Finds possible smCOG classifications for the provided genes.

        Arguments:
            cds_features: a list of CDSFeatures to classify

        Returns:
            a dictionary mapping CDS name to a list of HMMResult instances of
            classifications
    """
    query_fasta = fasta.get_fasta_from_features(cds_features)
    profile_path = path.get_full_path(__file__, "data", "smcogs.hmm")
    # hmmscan is run with "-E 1E-6"
    raw_hits = subprocessing.run_hmmscan(profile_path, query_fasta, ["-E", "1E-6"])
    profile_lengths = utils.get_hmm_lengths(profile_path)
    return refine_hmmscan_results(raw_hits, profile_lengths)
def run_starter_unit_blastp(cluster: Cluster,
                            cds_hmm_hits: Dict[str, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on the starter unit coding sequences of the given cluster.

        Arguments:
            cluster: the Cluster whose parent record is used to look up CDSs
            cds_hmm_hits: HMMResults by CDS name from the type II PKS hmmscan

        Returns:
            an empty dictionary if no CDS has a starter unit hit, otherwise
            a dictionary mapping CDS name to a list of HMMResults built from
            the blastp results
    """
    starter_ids = ['KSIII', 'AT', 'AMID', 'LIG']

    # gather, per CDS feature, the ids of its hits that mark a starter unit
    ids_by_cds = {}
    for cds_name, hits in cds_hmm_hits.items():
        matching = [hit.hit_id for hit in hits if hit.hit_id in starter_ids]
        if not matching:
            continue
        ids_by_cds[cluster.parent_record.get_cds_by_name(cds_name)] = matching

    if not ids_by_cds:
        return {}

    blast_hits = []
    reference_fastas = set()
    for cds_feature, hit_ids in ids_by_cds.items():
        query = fasta.get_fasta_from_features([cds_feature])
        for hit_id in hit_ids:
            # each starter unit type has a blast database named after its id,
            # with a matching ".fasta" file alongside it
            database = path.get_full_path(__file__, 'data', hit_id)
            blast_hits.extend(subprocessing.run_blastp(database, query))
            reference_fastas.add(path.get_full_path(__file__, 'data', hit_id + '.fasta'))

    # reference sequence lengths take the place of profile lengths in refinement
    reference_lengths = {}
    for reference in reference_fastas:
        reference_lengths.update(get_fasta_lengths(reference))
    return refine_hmmscan_results(blast_hits, reference_lengths)
def run_starter_unit_blastp(cds_hmm_hits: Dict[CDSFeature, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on the starter unit coding sequences among the given CDSs.

        Arguments:
            cds_hmm_hits: HMMResults by CDS from the type II PKS hmmscan

        Returns:
            a dictionary mapping CDS name to a list of HMMResults, with every
            hit id carrying a "-CoA" suffix; empty if blastp produced nothing
    """
    starter_ids = ['KSIII', 'AT', 'AMID', 'LIG']

    blast_hits = []
    reference_fastas = set()
    for cds, hmm_hits in cds_hmm_hits.items():
        query = fasta.get_fasta_from_features([cds])
        for hmm_hit in hmm_hits:
            if hmm_hit.hit_id in starter_ids:
                # each starter unit type has a blast database named after its
                # id, with a matching ".fasta" file alongside it
                database = path.get_full_path(__file__, 'data', hmm_hit.hit_id)
                blast_hits.extend(subprocessing.run_blastp(database, query))
                reference_fastas.add(
                    path.get_full_path(__file__, 'data', hmm_hit.hit_id + '.fasta'))

    if not blast_hits:
        return {}

    # reference sequence lengths take the place of profile lengths in refinement
    reference_lengths = {}
    for reference in reference_fastas:
        reference_lengths.update(get_fasta_lengths(reference))

    refined = refine_hmmscan_results(blast_hits, reference_lengths)
    # replace, in place, any hit whose id lacks the "-CoA" suffix
    for hits in refined.values():
        for index, hit in enumerate(hits):
            if hit.hit_id.endswith("-CoA"):
                continue
            hits[index] = HMMResult(hit.hit_id + "-CoA", hit.query_start,
                                    hit.query_end, hit.evalue, hit.bitscore)
    return refined