def run_glimmerhmm(record: Record) -> None:
    """ Run glimmerhmm on the record, parse the results and add all detected genes to the record """
    with TemporaryDirectory(change=True):
        # glimmerHMM/gff_parser handles some record names poorly (e.g. leading - or only '.')
        orig_id = record.id
        record.id = "input"
        # write the FASTA file and run GlimmerHMM
        fasta_file = write_search_fasta(record)
        record.id = orig_id
        results_text = run_external(fasta_file)

    if "CDS" not in results_text:
        return

    handle = StringIO(results_text)
    features = get_features_from_file(handle)["input"]
    for feature in features:
        record.add_biopython_feature(feature)
def test_parse_all_multi_cluster(self):
    # test we partition correctly by cluster number
    sample_data = self.read_sample_data("data/diamond_output_sample_multicluster.txt")
    clusters_by_number, queries_by_number = core.parse_all_clusters(sample_data, Record(), 0, 0)
    self.assertEqual(len(clusters_by_number), 3)
    self.assertEqual(sorted(clusters_by_number), [1, 2, 4])
    self.assertEqual(len(queries_by_number), 3)
    self.assertEqual(sorted(queries_by_number), [1, 2, 4])
    for i in [1, 2, 4]:
        self.assertEqual(len(clusters_by_number[i]), i)
        self.assertEqual(len(queries_by_number[i]), i)
def test_labyrinthopeptin(self):
    "Test lanthipeptide prediction for labyrinthopeptin"
    filename = path.get_full_path(__file__, 'data', 'labyrinthopeptin.gbk')
    rec = Record.from_biopython(seqio.read(filename), taxon="bacteria")
    assert not rec.get_cds_motifs()

    result = run_specific_analysis(rec)
    motifs = self.gather_all_motifs(result)
    assert len(motifs) == 2
    assert not rec.get_cds_motifs()

    result.add_to_record(rec)
    assert len(rec.get_cds_motifs()) == 2
def run_on_record(record: Record, results: Optional[SMCOGTreeResults], options: ConfigType) -> SMCOGTreeResults:
    """ Generates phylogeny trees of the classifications made by SMCOGs """
    if results and isinstance(results, SMCOGTreeResults):
        return results

    # create the smcogs output directory if required
    relative_output_dir = os.path.relpath(os.path.join(options.output_dir, "smcogs"), os.getcwd())
    smcogs_dir = os.path.abspath(relative_output_dir)
    if not os.path.exists(smcogs_dir):
        os.mkdir(smcogs_dir)

    nrpspks_genes = record.get_nrps_pks_cds_features()

    with path.changed_directory(smcogs_dir):
        trees = generate_trees(smcogs_dir, record.get_cds_features_within_regions(), nrpspks_genes)

    return SMCOGTreeResults(record.id, relative_output_dir, trees)
def generate_results(record: Record, options: ConfigType) -> ClusterFinderResults:
    """ Find and construct cluster borders """
    rule_clusters = find_rule_based_clusters(record, options)
    prob_clusters = find_probabilistic_clusters(record, options)

    new_clusters = []
    new_clusters.extend(rule_clusters)
    for cluster in prob_clusters:
        new_cluster = ClusterBorder(cluster.location, tool="clusterfinder",
                                    probability=cluster.probability,
                                    product=PUTATIVE_PRODUCT,
                                    high_priority_product=False)
        new_clusters.append(new_cluster)

    if options.cf_create_clusters:
        for border in new_clusters:
            record.add_cluster_border(border)

    return ClusterFinderResults(record.id, new_clusters, create=options.cf_create_clusters)
def generate_pfam2go_tooltip(record: Record, feature: CDSFeature) -> List[html_renderer.Markup]:
    """Create tooltip text for Pfam to Gene Ontologies results."""
    go_notes = []
    unique_pfams_with_gos = {}
    for pfam in record.get_pfam_domains_in_cds(feature):
        if pfam.gene_ontologies:
            pfam_id = pfam.full_identifier
            unique_pfams_with_gos[pfam_id] = pfam.gene_ontologies
    for unique_id, go_qualifier in sorted(unique_pfams_with_gos.items()):
        go_notes.extend(build_pfam2go_links(go_qualifier, prefix=f"{unique_id}: "))
    return list(map(html_renderer.Markup, go_notes))
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """Store information about promoter sequences to a SeqRecord"""
    for promoter in promoters:
        # remember to account for 0-indexed start location
        new_feature = SeqFeature(FeatureLocation(max(0, promoter.start - 1), promoter.end),
                                 type="promoter")
        new_feature.qualifiers = {
            "locus_tag": promoter.get_gene_names(),  # already a list with one or two elements
            "seq": [str(promoter.seq)],
        }

        if isinstance(promoter, CombinedPromoter):
            new_feature.qualifiers["note"] = ["bidirectional promoter"]

        secmet_version = Feature.from_biopython(new_feature)
        secmet_version.created_by_antismash = True
        record.add_feature(secmet_version)
def write_search_fasta(record: Record) -> str:
    """ Constructs a FASTA representation of a record and writes it to a
        file in the current directory.

        Returns:
            the name of the file created
    """
    filename = "{}.fasta".format(record.id)
    with open(filename, 'w') as handle:
        seqio.write([record.to_biopython()], handle, 'fasta')
    return filename
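# A minimal usage sketch for write_search_fasta(), not part of the original module: it
# assumes a secmet Record instance has already been loaded (e.g. via Record.from_genbank(),
# as used in the tests in this collection) and mirrors how run_glimmerhmm() above pairs the
# call with a temporary working directory so the FASTA file is cleaned up afterwards.
import os  # only needed for the existence check in this sketch


def example_write_fasta(record: Record) -> str:
    """ Writes the record to "<record.id>.fasta" in the current directory and returns the path """
    filename = write_search_fasta(record)
    assert filename == "{}.fasta".format(record.id)
    assert os.path.exists(filename)
    return filename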
def test_result_conversion(self):
    nisin = Record.from_genbank(helpers.get_path_to_nisin_with_detection())[0]
    with open(path.get_full_path(__file__, "data", "nisin.out")) as handle:
        trimmed_output = handle.read()
    with patch.object(subprocessing, "run_diamond_search", return_value=trimmed_output):
        results = cluster_compare.run_on_record(nisin, None, self.options)
    assert results.by_database["MIBiG"].by_region[1]

    # ensure JSON conversion of results gives the same result
    raw = json.loads(json.dumps(results.to_json()))
    regenerated = cluster_compare.regenerate_previous_results(raw, nisin, self.options)
    regen_raw = json.loads(json.dumps(regenerated.to_json()))
    assert regen_raw == raw
def test_sco_cluster3(self):
    "Test lanthipeptide prediction for SCO cluster #3"
    filename = path.get_full_path(__file__, 'data', 'sco_cluster3.gbk')
    rec = Record.from_biopython(seqio.read(filename), taxon="bacteria")
    assert not rec.get_cds_motifs()

    result = run_specific_analysis(rec)
    motifs = self.gather_all_motifs(result)
    assert len(motifs) == 1
    assert not rec.get_cds_motifs()

    result.add_to_record(rec)
    assert len(rec.get_cds_motifs()) == 1
    self.assertEqual('Class I', motifs[0].peptide_subclass)
def check_content(sequence: Record) -> Record:
    """ Checks if the sequence of a record is correct for the input type.
        If not, the record's skip flag will be marked.

        Arguments:
            sequence: the Record instance to check

        Returns:
            the Record instance provided
    """
    cdsfeatures = sequence.get_cds_features()
    cdsfeatures_with_translations = len([cds for cds in cdsfeatures if cds.translation])
    assert cdsfeatures_with_translations == len(cdsfeatures)
    if not isinstance(sequence.seq.alphabet, Bio.Alphabet.NucleotideAlphabet) \
            and not is_nucl_seq(sequence.seq):
        logging.error("Record %s is a protein record, skipping.", sequence.id)
        sequence.skip = "protein record"
    else:
        sequence.seq.alphabet = Bio.Alphabet.generic_dna
    return sequence
def convert_tta_codons(tta_codons: List[Feature], record: Record) -> List[Dict[str, Any]]:
    """Convert found TTA codon features to JSON"""
    js_codons = []
    for codon in tta_codons:
        cdses = record.get_cds_features_within_location(codon.location, with_overlapping=True)
        js_codons.append({
            'start': codon.location.start + 1,
            'end': codon.location.end,
            'strand': codon.strand if codon.strand is not None else 1,
            'containedBy': [cds.get_name() for cds in cdses],
        })
    return js_codons
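# An illustrative sketch (not from the original module) of the JSON shape produced by
# convert_tta_codons() above; the keys come directly from the function, while the values
# shown here are made up purely for illustration.
EXAMPLE_TTA_CODON_JSON = {
    "start": 1042,               # 1-indexed start of the TTA codon
    "end": 1044,                 # end coordinate of the codon
    "strand": 1,                 # feature strand, defaulting to 1 when the feature has no strand
    "containedBy": ["ctg1_12"],  # names of CDS features overlapping the codon location
}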
def run_and_regenerate_results_for_module(input_file, module, options,
                                          expected_record_count=1, callback=None):
    """ Runs antismash end to end over the given file with the given options
        and returns the given module's regenerated results.

        If callback is supplied, it will be called with the output directory path
        as an argument before the output directory is cleared.
    """
    with TemporaryDirectory(change=True) as tempdir:
        orig_output = options.output_dir
        update_config({"output_dir": tempdir})
        base_filename = os.path.join(options.output_dir, os.path.basename(input_file).rsplit('.', 1)[0])
        json_filename = base_filename + ".json"
        assert not os.path.exists(json_filename)
        try:
            antismash.main.run_antismash(input_file, options)
        except:
            update_config({"output_dir": orig_output})
            raise
        update_config({"output_dir": orig_output})

        results = serialiser.AntismashResults.from_file(json_filename)
        # remove things that were added by results, because otherwise the add isn't
        # tested by detection result regeneration
        for record in results.records:
            record.strip_antismash_annotations()

        if callback:
            callback(tempdir)

        # and while the genbank output still exists, grab that and check it's readable
        assert len(Record.from_genbank(base_filename + ".gbk")) == expected_record_count

    # not the responsibility of modules, but if it's wrong then everything is
    assert len(results.results) == expected_record_count
    assert len(results.records) == expected_record_count

    # ensure all detection stages add their relevant parts
    modules_to_regenerate = antismash.main.get_detection_modules()
    final = []
    for record, rec_results in zip(results.records, results.results):
        regenerate_results_for_record(record, options, modules_to_regenerate, module, rec_results)
        # post (other) detection has run, regenerate (since they may need regions etc)
        final.append(module.regenerate_previous_results(rec_results.get(module.__name__), record, options))
    for res in final:
        assert isinstance(res, ModuleResults)

    if expected_record_count == 1:
        return final[0]
    return final
def find_all_orfs(record: Record, cluster: Optional[Cluster] = None) -> List[CDSFeature]:
    """ Find all ORFs of at least 60 bases that don't overlap with existing
        CDS features. Can (and should) be limited to just within a cluster.

        Arguments:
            record: the record to search
            cluster: the specific Cluster to search within, or None

        Returns:
            a list of CDSFeatures, one for each ORF
    """
    # get the sequence for the range
    offset = 0
    seq = record.seq
    existing = record.get_cds_features()
    if cluster:
        seq = record.seq[cluster.location.start:cluster.location.end]
        offset = cluster.location.start
        existing = tuple(cluster.cds_children)

    # find ORFs throughout the range
    forward_matches = scan_orfs(seq, 1, offset)
    reverse_matches = scan_orfs(seq.reverse_complement(), -1, offset)
    locations = forward_matches + reverse_matches

    orfnr = 1
    new_features = []

    for location in locations:
        if cluster:
            if isinstance(location.start, (BeforePosition, AfterPosition)):
                continue
            if isinstance(location.end, (BeforePosition, AfterPosition)):
                continue
        dummy_feature = Feature(location, feature_type="dummy")
        # skip if it overlaps with existing CDSs
        if any(dummy_feature.overlaps_with(cds) for cds in existing):
            continue

        feature = create_feature_from_location(record, location, orfnr)

        # skip if not wholly contained in the cluster
        if cluster and not feature.is_contained_by(cluster):
            continue
        new_features.append(feature)
        orfnr += 1

    return new_features
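# A minimal sketch (not part of the original module) of how find_all_orfs() results can be
# added back to a record, mirroring the way the lanthipeptide and sactipeptide analyses in
# this collection consume the new ORFs; record.add_cds_feature() is assumed to be the
# appropriate registration method, as with the other secmet Record additions shown here.
def add_unannotated_orfs(record: Record, cluster: Cluster) -> int:
    """ Adds any ORFs found within the cluster to the record, returning how many were added """
    new_orfs = find_all_orfs(record, cluster)
    for orf in new_orfs:
        record.add_cds_feature(orf)
    return len(new_orfs)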
def analyse_biosynthetic_order(nrps_pks_features: List[CDSFeature],
                               consensus_predictions: Dict[str, str],
                               record: Record) -> Dict[int, Tuple[str, bool]]:
    """ For each NRPS or PKS cluster, determines if that cluster is docking or not,
        then calls generate_substrates_order()

        Arguments:
            nrps_pks_features: all NRPS/PKS features within the record
            consensus_predictions: a dictionary mapping each NRPS/PKS domain name to its prediction
            record: the Record being analysed

        Returns:
            a dictionary mapping cluster number to a tuple of prediction string and
            whether docking domain analysis was used for the prediction
    """
    compound_predictions = {}  # type: Dict[int, Tuple[str, bool]]
    # find NRPS/PKS gene clusters
    nrpspksclusters = [cluster for cluster in record.get_clusters()
                       if "nrps" in cluster.products or "pks" in "-".join(cluster.products)]
    if not nrpspksclusters:
        return {}

    # predict biosynthetic gene order in gene cluster using starter domains,
    # thioesterase domains, gene order and docking domains
    for cluster in nrpspksclusters:
        cluster_number = cluster.get_cluster_number()
        cds_in_cluster = [gene for gene in nrps_pks_features if gene.overlaps_with(cluster)]
        if not cds_in_cluster:
            continue
        pks_count, nrps_count, hybrid_count = find_cluster_modular_enzymes(cds_in_cluster)
        # if more than three PKS CDS features, use dock_dom_analysis if possible to identify order
        if 3 < pks_count < 11 and not nrps_count and not hybrid_count:
            logging.debug("Cluster %d monomer ordering method: domain docking analysis", cluster_number)
            geneorder = perform_docking_domain_analysis(cds_in_cluster)
            docking = True
        else:
            logging.debug("Cluster %d monomer ordering method: colinear", cluster_number)
            geneorder = find_colinear_order(cds_in_cluster)
            docking = False
        prediction = generate_substrates_order(geneorder, consensus_predictions)
        compound_predictions[cluster_number] = (prediction, docking)
    return compound_predictions
def from_json(json: Dict[str, Any], record: Record) -> "CDSResults":
    """ Constructs a CDSResults instance from a JSON representation """
    domains = []
    for json_domain in json["domains"]:
        domains.append(SecMetQualifier.Domain.from_json(json_domain))

    cds = record.get_cds_by_name(json["cds_name"])
    definition_domains = {key: set(val) for key, val in json["definition_domains"].items()}

    return CDSResults(cds, domains, definition_domains)
def annotate_domains(record: Record) -> None:
    """ Annotates NRPS/PKS domains on CDS features. The `nrps_pks` member of
        each feature will be updated, along with creating CDSMotif features
        when relevant.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            None
    """
    cds_within_clusters = record.get_cds_features_within_clusters()
    assert cds_within_clusters  # because every cluster should have genes

    fasta = get_fasta_from_features(cds_within_clusters)
    cds_domains = find_domains(fasta, record)
    cds_motifs = find_ab_motifs(fasta)

    for cds in cds_within_clusters:
        cds_name = cds.get_name()
        # gather domains and classify
        domains = cds_domains.get(cds_name)
        if not domains:
            continue
        domain_type = classify_feature([domain.hit_id for domain in domains])
        cds.nrps_pks.type = domain_type

        for domain in domains:
            cds.nrps_pks.add_domain(domain)

        # construct motif features
        motifs = cds_motifs.get(cds_name)
        if not motifs:
            continue
        motif_features = generate_motif_features(record, cds, motifs)

        for motif in motif_features:
            record.add_cds_motif(motif)
        cds.motifs.extend(motif_features)
def get_cds_lengths(record: secmet.Record) -> Dict[str, int]:
    """ Calculates the lengths of each CDS feature in a Record.

        Arguments:
            record: the Record to gather CDS features from

        Returns:
            a dictionary mapping CDS accession to length of the CDS
    """
    lengths = {}
    for cds in record.get_cds_features():
        lengths[cds.get_accession()] = len(cds.translation)
    return lengths
def parse_all_wrapper(coverage_threshold, ident_threshold):
    clusters_by_number, queries_by_number = core.parse_all_clusters(self.sample_data, Record(),
                                                                    coverage_threshold, ident_threshold)
    # make sure we only found one cluster number
    self.assertEqual(len(clusters_by_number), 1)
    self.assertEqual(list(clusters_by_number), [24])
    self.assertEqual(len(queries_by_number), 1)
    self.assertEqual(list(queries_by_number), [24])

    # now test the values of those queries
    queries = queries_by_number[24]
    clusters = clusters_by_number[24]
    return queries, clusters
def from_json(json: Dict[str, Any], record: secmet.Record) -> "ASFResults":
    if ASFResults.schema_version != json.pop("schema version", None):
        logging.warning("Dropping ASF results, schema version has changed")
        return None
    if record.id != json.pop("record id", None):
        raise ValueError("ASF results contained mismatching record ids")
    pairings = []
    for domain_name, labels in json["pairings"]:
        domain = record.get_domain_by_name(domain_name)
        pairings.append((domain, labels))
    return ASFResults(record.id, pairings)
def add_to_record(self, record: Record) -> None:
    """ Adds the hits as PFAMDomains to the given record """
    db_version = pfamdb.get_db_version_from_path(self.database)
    for i, hit in enumerate(self.hits):
        protein_location = FeatureLocation(hit.protein_start, hit.protein_end)
        pfam_feature = PFAMDomain(location_from_string(hit.location), description=hit.description,
                                  protein_location=protein_location, identifier=hit.identifier,
                                  tool=self.tool, locus_tag=hit.locus_tag)
        for key in ["label", "locus_tag", "domain", "evalue", "score", "translation"]:
            setattr(pfam_feature, key, getattr(hit, key))
        pfam_feature.database = db_version
        pfam_feature.detection = "hmmscan"
        pfam_feature.domain_id = "{}_{}_{:04d}".format(self.tool, pfam_feature.locus_tag, i + 1)
        record.add_pfam_domain(pfam_feature)
def run_specific_analysis(record: Record, options: ConfigType) -> LanthiResults:
    """ Runs the full lanthipeptide analysis over the given record

        Arguments:
            record: the Record instance to analyse
            options: the antiSMASH config

        Returns:
            A populated LanthiResults object
    """
    results = LanthiResults(record.id)
    counter = 0
    for cluster in record.get_protoclusters():
        if cluster.product != 'lanthipeptide':
            continue

        # find core biosynthetic enzyme locations
        core_domain_names = {'Lant_dehydr_N', 'Lant_dehydr_C', 'DUF4135', 'Pkinase'}
        core_genes = []
        for gene in cluster.cds_children:
            if not gene.sec_met:
                continue
            # we seem to hit Lant_dehydr_C on some O-Methyltransferases that also hit PCMT
            if 'PCMT' in gene.sec_met.domain_ids:
                continue
            if core_domain_names.intersection(set(gene.sec_met.domain_ids)):
                core_genes.append(gene)

        precursor_candidates = find_lan_a_features(cluster)
        # find candidate ORFs that are not yet annotated
        extra_orfs = all_orfs.find_all_orfs(record, cluster)
        for orf in extra_orfs:
            if len(orf.translation) < 80:
                precursor_candidates.append(orf)

        for gene in core_genes:
            neighbours = find_neighbours_in_range(gene, precursor_candidates)
            if not neighbours:
                continue
            run_lanthi_on_genes(record, gene, cluster, neighbours, results)

        # analyse the cluster with RREfinder
        counter += 1
        name = '%s_%s_%s' % (record.id, cluster.product, counter)
        RRE_main(cluster, results, name, options)

    logging.debug("Lanthipeptide module marked %d motifs",
                  sum(map(len, results.motifs_by_locus.values())))
    return results
def add_to_record(self, record: Record) -> None:
    """ Save substrate specificity predictions in NRPS/PKS domain sec_met info of record """
    for candidate_cluster_preds in self.region_predictions.values():
        for cluster_pred in candidate_cluster_preds:
            assert isinstance(cluster_pred, CandidateClusterPrediction), type(cluster_pred)
            candidate = record.get_candidate_cluster(cluster_pred.candidate_cluster_number)
            candidate.smiles_structure = cluster_pred.smiles

    for cds_feature in record.get_nrps_pks_cds_features():
        assert cds_feature.region, "CDS parent region removed since analysis"
        nrps_qualifier = cds_feature.nrps_pks
        for domain in nrps_qualifier.domains:
            feature = record.get_domain_by_name(domain.feature_name)
            assert isinstance(feature, AntismashDomain)
            domain.predictions.clear()
            if domain.name in ["AMP-binding", "A-OX"]:
                self._annotate_a_domain(domain)
            elif domain.name == "PKS_AT":
                self._annotate_at_domain(domain, "transatpks" in cds_feature.region.products)
            elif domain.name == "CAL_domain":
                self._annotate_cal_domain(domain)
            elif domain.name == "PKS_KR":
                self._annotate_kr_domain(domain)
            # otherwise one of many without prediction methods/relevance (PCP, Cglyc, etc)

            for method, pred in domain.predictions.items():
                feature.specificity.append("%s: %s" % (method, pred))

            mapping = DOMAIN_TYPE_MAPPING.get(domain.name)
            if mapping:
                feature.domain_subtype = domain.name
                feature.domain = mapping
def run_on_record(record: Record, results: Optional[SMCOGResults], options: ConfigType) -> SMCOGResults:
    """ Classifies gene functions and, if requested, generates phylogeny trees
        of the classifications
    """
    relative_output_dir = os.path.relpath(os.path.join(options.output_dir, "smcogs"), os.getcwd())
    smcogs_dir = os.path.abspath(relative_output_dir)
    if not os.path.exists(smcogs_dir):
        os.mkdir(smcogs_dir)

    if not results:
        results = SMCOGResults(record.id)

        genes = record.get_cds_features_within_clusters()
        hmm_results = classify_genes(genes)

        for gene in genes:
            gene_name = gene.get_name()
            hits = hmm_results.get(gene_name)
            if not hits:
                continue
            results.best_hits[gene.get_name()] = hits[0]
        write_smcogs_file(hmm_results, genes, record.get_nrps_pks_cds_features(), options)

        if not results.tree_images and options.smcogs_trees:
            # create the smcogs output directory if required
            results.relative_tree_path = relative_output_dir
            original_dir = os.getcwd()
            os.chdir(smcogs_dir)  # TODO: make a context manager
            nrpspks_genes = record.get_nrps_pks_cds_features()
            nrpspks_genes = []
            results.tree_images = generate_trees(smcogs_dir, hmm_results, genes, nrpspks_genes)
            os.chdir(original_dir)

    return results
def generate_pfam2go_tooltip(record: Record, feature: CDSFeature) -> List[html_renderer.Markup]:
    """Create tooltip text for Pfam to Gene Ontologies results."""
    go_notes = []
    unique_pfams_with_gos = {}
    go_url = 'http://amigo.geneontology.org/amigo/term/'
    go_info_line = "{pf_id}: <a class='external-link' href='{url}{go_id}' target='_blank'>{go_id}</a>: {go_desc}"
    for pfam in record.get_pfam_domains_in_cds(feature):
        if pfam.gene_ontologies:
            pfam_id = pfam.full_identifier
            unique_pfams_with_gos[pfam_id] = pfam.gene_ontologies
    for unique_id, go_qualifier in sorted(unique_pfams_with_gos.items()):
        for go_id, go_description in sorted(go_qualifier.go_entries.items()):
            go_notes.append(go_info_line.format(pf_id=unique_id, url=go_url,
                                                go_id=go_id, go_desc=go_description))
    return list(map(html_renderer.Markup, go_notes))
def convert_record(record: secmet.Record, fasta: IO, skip_contig_edge: bool = True) -> Dict[str, Any]:
    result = {
        "regions": [],
        "cds_mapping": {},
    }  # type: Dict[str, Any]
    cds_index = Counter()
    for region in record.get_regions():
        if skip_contig_edge and region.contig_edge:
            continue
        result["regions"].append(convert_region(region, result["cds_mapping"], cds_index, fasta))
    return result
def blastparse(blasttext: str, record: secmet.Record, min_seq_coverage: float = -1.,
               min_perc_identity: float = -1.) -> Tuple[Dict[str, Query], Dict[str, List[Query]]]:
    """ Parses blast output into a usable form, limiting to a single best hit
        for every query. Results can be further trimmed by minimum thresholds of
        both coverage and percent identity.

        Arguments:
            blasttext: the output from diamond in blast format
            record: used to get all gene ids in the cluster, and used as a backup
                    to fetch sequence length if missing from seqlengths
            min_seq_coverage: the exclusive lower bound of sequence coverage for a match
            min_perc_identity: the exclusive lower bound of identity similarity for a match

        Returns:
            a tuple of
                a dictionary mapping query id to Query instance
                a dictionary mapping cluster number to a list of Query instances from that cluster
    """
    seqlengths = get_cds_lengths(record)
    names = set(cds.get_name() for cds in record.get_cds_features_within_clusters())
    queries = OrderedDict()  # type: Dict[str, Query]
    clusters = OrderedDict()  # type: Dict[str, List[Query]]
    blastlines = remove_duplicate_hits([line.split("\t") for line in blasttext.rstrip().split("\n")])
    current_query = None

    for tabs in blastlines:
        query = tabs[0]
        subject = parse_subject(tabs, seqlengths, names, record)

        # only process the pairing if limits met
        if subject.perc_ident <= min_perc_identity \
                or subject.perc_coverage <= min_seq_coverage:
            continue

        new_query = query not in queries
        new_hit = subject.genecluster not in clusters

        if new_query:
            current_query = Query(query, len(queries))
            queries[query] = current_query

        if new_hit:
            clusters[subject.genecluster] = []
            clusters[subject.genecluster].append(current_query)

        # link the subject to the query
        current_query.add_subject(subject)

    return queries, clusters
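# A small sketch (not part of the original module) of consuming blastparse() output; it
# assumes `diamond_output` holds raw blast-format text from diamond, and the coverage and
# identity thresholds used here are arbitrary example values rather than project defaults.
def count_hits_per_reference_cluster(diamond_output: str, record: secmet.Record) -> Dict[str, int]:
    """ Returns the number of query genes recorded against each reference cluster """
    _queries, clusters = blastparse(diamond_output, record,
                                    min_seq_coverage=10., min_perc_identity=30.)
    return {cluster_id: len(cluster_queries) for cluster_id, cluster_queries in clusters.items()}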
def specific_analysis(record: secmet.Record) -> SactiResults:
    """ Analyse each sactipeptide cluster and find precursors within it.
        If an unannotated ORF would contain the precursor, it will be annotated.

        Arguments:
            record: the Record to analyse

        Returns:
            a SactiResults instance holding all found precursors and new ORFs
    """
    results = SactiResults(record.id)
    new_feature_hits = 0
    motif_count = 0
    for cluster in record.get_protoclusters():
        if cluster.product != 'sactipeptide':
            continue

        # find candidate ORFs that are not yet annotated
        new_orfs = all_orfs.find_all_orfs(record, cluster)
        hmm_results = run_non_biosynthetic_phmms(fasta.get_fasta_from_features(new_orfs))
        annotate_orfs(new_orfs, hmm_results)

        # get all CDS features to evaluate for RiPP-likeness
        candidates = list(cluster.cds_children) + new_orfs
        domains = get_detected_domains(cluster)

        # evaluate each candidate precursor peptide
        for candidate in candidates:
            motif = run_sactipred(cluster, candidate, domains)
            if motif is None:
                continue

            results.motifs_by_locus[candidate.get_name()].append(motif)
            motif_count += 1
            results.clusters[cluster.get_protocluster_number()].add(candidate.get_name())

            # track new CDSFeatures if found with all_orfs
            if candidate.region is None:
                results.new_cds_features.add(candidate)
                new_feature_hits += 1

    if not motif_count:
        logging.debug("Found no sactipeptide motifs")
    else:
        verb = "is" if new_feature_hits == 1 else "are"
        logging.debug("Found %d sactipeptide motif(s) in %d feature(s), %d of which %s new",
                      motif_count, len(results.motifs_by_locus), new_feature_hits, verb)
    return results
def ensure_cds_info(single_entry: bool, genefinding: Callable[[Record, Any], None], sequence: Record) -> Record:
    """ Ensures the given record has CDS features with unique locus tags.
        CDS features are retrieved from a GFF file or via genefinding, depending
        on the antismash options.

        Records without CDS features will have their skip flag marked.

        Arguments:
            single_entry: whether gff_parser can ignore mismatching record ids,
                          provided there's only one record provided here and in the GFF file
            genefinding: the relevant run_on_record(record, options) function to use
                         for finding genes if no GFF file is being used
            sequence: the Record instance to ensure CDS features for

        Returns:
            the Record instance provided
    """
    options = get_config()
    if sequence.skip:
        return sequence
    if not sequence.get_cds_features():
        if options.genefinding_gff3:
            logging.info("No CDS features found in record %r but GFF3 file provided, running GFF parser.",
                         sequence.id)
            gff_parser.run(sequence, single_entry, options)
            if not sequence.get_cds_features():
                logging.warning("Record %s has no genes even after running GFF parser, skipping.",
                                sequence.id)
                sequence.skip = "No genes found"
                return sequence
        elif options.genefinding_tool != "none":
            logging.info("No CDS features found in record %r, running gene finding.", sequence.id)
            genefinding(sequence, options)
            if not sequence.get_cds_features():
                logging.info("No genes found, skipping record")
                sequence.skip = "No genes found"
                return sequence
    return sequence
def specific_analysis(record: Record, results: NRPS_PKS_Results, options: ConfigType) -> NRPS_PKS_Results:
    """ Runs the various NRPS/PKS analyses on a record and returns their results """
    nrps_pks_genes = record.get_nrps_pks_cds_features()

    if not nrps_pks_genes:
        logging.debug("No NRPS or PKS genes found, skipping analysis")
        return results

    a_domains = get_a_domains_from_cds_features(record, nrps_pks_genes)
    if a_domains:
        logging.info("Predicting A domain substrate specificities with NRPSPredictor2")
        results.add_method_results("NRPSPredictor2", run_nrpspredictor(a_domains, options))
    # TODO: add call to run_siderophore_predictions

    pks_results = run_pks_substr_spec_predictions(nrps_pks_genes)
    for method, method_results in pks_results.items():
        results.add_method_results(method, method_results)

    consensus_pair = calculate_consensus_prediction(nrps_pks_genes, results.domain_predictions)
    results.consensus, results.consensus_transat = consensus_pair
    # TODO: add call to run_lipopeptide_predictions

    candidate_cluster_predictions = analyse_biosynthetic_order(nrps_pks_genes, results.consensus, record)
    for prediction in candidate_cluster_predictions:
        candidate_cluster = record.get_candidate_cluster(prediction.candidate_cluster_number)
        region = candidate_cluster.parent
        assert isinstance(region, Region), type(region)
        results.region_predictions[region.get_region_number()].append(prediction)
    return results